#Replication code for:
#Isaac D. Mehlhaff. "A Group-Based Approach to Measuring Polarization." American Political Science Review. September 2023.
###################################################################################
#setup
###################################################################################

#install and load necessary packages

packages <- c("labelled", "pbapply", "doParallel", "doRNG", "psych", "cluster", "MASS", "prefmod",
              "stargazer", "cowplot", "grid", "gtools", "gtable", "latex2exp", "car", "geosphere",
              "CPC", "tidyverse")

for (i in packages) {
  if(i %in% installed.packages()) {
    print("Package already installed")
  }
  
  else {
    install.packages(i, dependencies = TRUE)
  }
  
  library(i, character.only = TRUE)
}

library(labelled)
library(pbapply)
library(doParallel)
library(doRNG)
library(psych)
library(cluster)
library(MASS)
library(prefmod)
library(stargazer)
library(cowplot)
library(grid)
library(gtools)
library(gtable)
library(latex2exp)
library(car)
library(geosphere)
library(CPC)
library(tidyverse)

#stargazer <= 5.2.3 has a known issue in R >= 4.2. If the stargazer calls below throw an error, uncomment and run the following code to implement a workaround for this issue.

#source(stargazer_fix.R)

#if necessary, set working directory to wherever the data files are saved

#setwd()

#set ggplot theme

theme_set(theme_bw(base_size = 22))

#set number of cores for parallel processing

cores <- detectCores()

#import data

load("cses_imd.rdata")
CSEStrenddata <- as.data.frame(cses_imd)
val_labels(CSEStrenddata) <- NULL

nominate <- as.data.frame(read.csv("HSall_members.csv"))
nominate_parties <- as.data.frame(read.csv("NOMINATE_parties.csv"))

brookings <- as.data.frame(read.csv("party_unity_scores.csv"))

load("results-elites-germany.rdata")
germany <- results
load("results-elites-italy.rdata")
italy <- results
load("results-elites-NL.rdata")
netherlands <- results
load("results-elites-spain.rdata")
spain <- results
load("results-elites-UK.rdata")
UK <- results
load("results-elites-US.rdata")
US <- results
rm(results)

lit <- as.data.frame(read.csv("literature_analysis.csv"))

load("label_dist.RData")

label_data <- as.data.frame(read.csv("label_data.csv"))
label_data <- label_data[,-1]

vdem <- as.data.frame(read.csv("V-Dem-CPD-Party-V2.csv"))

###################################################################################
#functions
###################################################################################

#write function to cluster observations and calculate distance

diff.uni <- function(x, y) {
  input <- na.omit(matrix(x, ncol = y))
  
  if(length(unique(input)) < 2){
    return(NA)
  }
  
  else {
    output_kmeans <- kmeans(x = input, centers = 2, nstart = 30)
    diff <- abs(as.numeric(output_kmeans$centers[1, 1] - output_kmeans$centers[2, 1]))
    return(diff)
  }
}

#write function to shift legend into empty space in plot

shift_legend <- function(p) {
  
  # check if p is a valid object
  if(!"gtable" %in% class(p)){
    if("ggplot" %in% class(p)){
      gp <- ggplotGrob(p) # convert to grob
    } else {
      message("This is neither a ggplot object nor a grob generated from ggplotGrob. Returning original plot.")
      return(p)
    }
  } else {
    gp <- p
  }
  
  # check for unfilled facet panels
  facet.panels <- grep("^panel", gp[["layout"]][["name"]])
  empty.facet.panels <- sapply(facet.panels, function(i) "zeroGrob" %in% class(gp[["grobs"]][[i]]))
  empty.facet.panels <- facet.panels[empty.facet.panels]
  if(length(empty.facet.panels) == 0){
    message("There are no unfilled facet panels to shift legend into. Returning original plot.")
    return(p)
  }
  
  # establish extent of unfilled facet panels (including any axis cells in between)
  empty.facet.panels <- gp[["layout"]][empty.facet.panels, ]
  empty.facet.panels <- list(min(empty.facet.panels[["t"]]), min(empty.facet.panels[["l"]]),
                             max(empty.facet.panels[["b"]]), max(empty.facet.panels[["r"]]))
  names(empty.facet.panels) <- c("t", "l", "b", "r")
  
  # extract legend & copy over to location of unfilled facet panels
  guide.grob <- which(gp[["layout"]][["name"]] == "guide-box")
  if(length(guide.grob) == 0){
    message("There is no legend present. Returning original plot.")
    return(p)
  }
  gp <- gtable_add_grob(x = gp,
                        grobs = gp[["grobs"]][[guide.grob]],
                        t = empty.facet.panels[["t"]],
                        l = empty.facet.panels[["l"]],
                        b = empty.facet.panels[["b"]],
                        r = empty.facet.panels[["r"]],
                        name = "new-guide-box")
  
  # squash the original guide box's row / column (whichever applicable)
  # & empty its cell
  guide.grob <- gp[["layout"]][guide.grob, ]
  if(guide.grob[["l"]] == guide.grob[["r"]]){
    gp <- gtable_squash_cols(gp, cols = guide.grob[["l"]])
  }
  if(guide.grob[["t"]] == guide.grob[["b"]]){
    gp <- gtable_squash_rows(gp, rows = guide.grob[["t"]])
  }
  gp <- gtable_remove_grobs(gp, "guide-box")
  
  return(gp)
}

###################################################################################
#data cleaning
###################################################################################

#select and rename relevant variables

cses <- filter(CSEStrenddata, IMD1008_MOD_4 == 1,
               !IMD1006_NAM %in% c("Hong Kong", "Philippines", "Taiwan", "Thailand", "Turkey",
                                   "Brazil", "Serbia", "Republic of Korea", "Latvia", "Mexico")) %>%
  dplyr::select(country = IMD1006_NAM, year = IMD1013_Y, party = IMD3005_3, affect1 = IMD3008_A,
                affect2 = IMD3008_B, affect3 = IMD3008_C, affect4 = IMD3008_D, affect5 = IMD3008_E,
                affect6 = IMD3008_F, affect7 = IMD3008_G, affect8 = IMD3008_H, affect9 = IMD3008_I,
                vote_up1 = IMD5003_A, vote_up2 = IMD5003_B, vote_up3 = IMD5003_C,
                vote_up4 = IMD5003_D, vote_up5 = IMD5004_E, vote_up6 = IMD5003_F,
                vote_up7 = IMD5003_G, vote_up8 = IMD5003_H, vote_up9 = IMD5003_I,
                vote_low1 = IMD5001_A, vote_low2 = IMD5001_B, vote_low3 = IMD5001_C,
                vote_low4 = IMD5001_D, vote_low5 = IMD5004_E, vote_low6 = IMD5001_F,
                vote_low7 = IMD5001_G, vote_low8 = IMD5001_H, vote_low9 = IMD5001_I,
                efficacy1 = IMD3011, efficacy2 = IMD3012, close = IMD3005_4, extreme = IMD3006) %>%
  rowwise() %>%
  mutate(country = ifelse(country == "United States of America", "United States",
                          ifelse(country == "Great Britain", "United Kingdom",
                                 ifelse(country == "Republic of Korea", "South Korea", country))),
         efficacy1 = ifelse(efficacy1 %in% c(1:5), efficacy1, NA),
         efficacy2 = ifelse(efficacy2 %in% c(1:5), efficacy2, NA),
         close = ifelse(close == 1, 3,
                        ifelse(close == 2, 2,
                               ifelse(close == 3, 1, NA))),
         extreme = ifelse(extreme %in% c(0, 10), 5,
                          ifelse(extreme %in% c(1, 9), 4,
                                 ifelse(extreme %in% c(2, 8), 3,
                                        ifelse(extreme %in% c(3, 7), 2,
                                               ifelse(extreme %in% c(4, 6), 1,
                                                      ifelse(extreme == 5, 0, NA)))))),
         vote1 = ifelse(vote_up1 < 100, vote_up1,
                        ifelse(vote_low1 < 100, vote_low1, NA)),
         vote2 = ifelse(vote_up2 < 100, vote_up2,
                        ifelse(vote_low2 < 100, vote_low2, NA)),
         vote3 = ifelse(vote_up3 < 100, vote_up3,
                        ifelse(vote_low3 < 100, vote_low3, NA)),
         vote4 = ifelse(vote_up4 < 100, vote_up4,
                        ifelse(vote_low4 < 100, vote_low4, NA)),
         vote5 = ifelse(vote_up5 < 100, vote_up5,
                        ifelse(vote_low5 < 100, vote_low5, NA)),
         vote6 = ifelse(vote_up6 < 100, vote_up6,
                        ifelse(vote_low6 < 100, vote_low6, NA)),
         vote7 = ifelse(vote_up7 < 100, vote_up7,
                        ifelse(vote_low7 < 100, vote_low7, NA)),
         vote8 = ifelse(vote_up8 < 100, vote_up8,
                        ifelse(vote_low8 < 100, vote_low8, NA)),
         vote9 = ifelse(vote_up9 < 100, vote_up9,
                        ifelse(vote_low9 < 100, vote_low9, NA)),
         affect7 = ifelse(country == "Argentina" & affect7 %in% c(0:10), affect7, NA),
         affect8 = ifelse(country == "Argentina" & affect8 %in% c(0:10), affect8, NA),
         affect2 = ifelse(country == "Argentina",
                          sum(affect7, affect8, na.rm = TRUE)/sum(!is.na(affect7),
                                                                  !is.na(affect8)), affect2),
         affect1 = ifelse(affect1 %in% c(0:10) & vote1 >= 10, affect1, NA),
         affect2 = ifelse(affect2 %in% c(0:10) & vote2 >= 10, affect2, NA),
         affect3 = ifelse(affect3 %in% c(0:10) & vote3 >= 10, affect3, NA),
         affect4 = ifelse(affect4 %in% c(0:10) & vote4 >= 10, affect4, NA),
         affect5 = ifelse(affect5 %in% c(0:10) & vote5 >= 10, affect5, NA),
         affect6 = ifelse(affect6 %in% c(0:10) & vote6 >= 10, affect6, NA),
         affect7 = ifelse(affect7 %in% c(0:10) & vote7 >= 10, affect7, NA),
         affect8 = ifelse(affect8 %in% c(0:10) & vote8 >= 10, affect8, NA),
         affect9 = ifelse(affect9 %in% c(0:10) & vote9 >= 10, affect9, NA),
         party = ifelse(party %in% c(320007, 320008), 320002,
                        ifelse(party %in% c(9999988, 9999989, 9999990, 9999991, 9999992, 9999997,
                                            9999998, 9999999), 9999999, party)),
         year = ifelse(country == "Canada" & year == 9999 & vote1 == 39.63, 2011,
                       ifelse(country == "Canada" & year == 9999 & vote1 == 39.47, 2015,
                              ifelse(country == "New Zealand" & year == 2012, 2011,
                                     ifelse(country == "New Zealand" & year == 2015, 2014,
                                            ifelse(country == "New Zealand" & year == 9999 & vote1 == 47.31, 2011,
                                                   ifelse(country == "New Zealand" & year == 9999 & vote1 == 47.04, 2014,
                                                          ifelse(country == "Norway" & year %in% c(2014, 9999), 2013,
                                                                 ifelse(country == "South Korea" & year == 9999, 2016,
                                                                        ifelse(country == "United States" & year %in% c(2013, 9999), 2012,
                                                                               ifelse(country == "United States" & year == 2017, 2016,
                                                                                      ifelse(country == "Australia" & year == 2014, 2013,
                                                                                             ifelse(country %in% c("Romania", "Serbia") & year == 2012, 2013, year)))))))))))),
         cy = paste0(country, year)) %>%
  dplyr::select(-starts_with("vote_")) %>%
  filter(!is.na(party), !is.na(year), !is.na(vote1), !cy %in% c("Canada2011", "Greece2013",
                                                                "Latvia2011", "Mexico2012",
                                                                "New Zealand2011"))

#remove observations from uncompleted (at time of analysis) 117th Congress and select necessary variables

nominate <- filter(nominate, congress %in% c(1:116)) %>%
  dplyr::select(congress, chamber, party_code, nominate_dim1, nominate_dim2, nokken_poole_dim1, nokken_poole_dim2) %>%
  left_join(., nominate_parties) %>%
  select(-party_code) %>%
  filter(!is.na(party))

#break apart into House and Senate (eliminates President observations)

nom_house_raw <- filter(nominate, chamber == "House")
nom_senate_raw <- filter(nominate, chamber == "Senate")

#assign non-major party observations to party clusters

nom_house <- as.data.frame(matrix(NA, ncol = 8))
nom_senate <- as.data.frame(matrix(NA, ncol = 8))

colnames(nom_house) <- c(colnames(nom_house_raw)[1:6], "party1", "party2")
colnames(nom_senate) <- c(colnames(nom_senate_raw)[1:6], "party1", "party2")

for (t in unique(nom_house_raw$congress)) {
  temp <- filter(nom_house_raw, congress == t)
  input <- na.omit(temp)

  centers1 <- matrix(c(mean(subset(input,
                                   party == names(sort(table(input$party),
                                                       decreasing = TRUE)[1]))[,3]),
                       mean(subset(input,
                                   party == names(sort(table(input$party),
                                                       decreasing = TRUE)[2]))[,3])),
                     ncol = 1, byrow = TRUE)
  centers2 <- matrix(c(mean(subset(input,
                                   party == names(sort(table(input$party),
                                                       decreasing = TRUE)[1]))[,3]),
                       mean(subset(input,
                                   party == names(sort(table(input$party),
                                                       decreasing = TRUE)[1]))[,4]),
                       mean(subset(input,
                                   party == names(sort(table(input$party),
                                                       decreasing = TRUE)[2]))[,3]),
                       mean(subset(input,
                                   party == names(sort(table(input$party),
                                                       decreasing = TRUE)[2]))[,4])),
                     ncol = 2, byrow = TRUE)
  
  distance1 <- matrix(NA, nrow = nrow(input), ncol = nrow(centers1))
  distance2 <- matrix(NA, nrow = nrow(input), ncol = nrow(centers2))
  
  dm1 <- apply(data.frame(1:nrow(centers1)), 1, function(x) {
    replace(distance1[,x], 1:nrow(input), centers1[x,] - input[,3])
  })
  dm2 <- apply(data.frame(1:nrow(centers2)), 1, function(x) {
    replace(distance2[,x], 1:nrow(input), distGeo(centers2[x,], input[,3:4]))
  })
  
  input$party1 <- mutate(as.data.frame(dm1), party1 = ifelse(abs(V1) < abs(V2), 1, 2)) %>%
    select(party1) %>%
    unlist()
  input$party2 <- apply(dm2, 1, which.min)
  
  input <- select(input, -party)
  
  input$congress <- t
  
  nom_house <- rbind(nom_house, input)
}

for (t in unique(nom_senate_raw$congress)) {
  temp <- filter(nom_senate_raw, congress == t)
  input <- na.omit(temp)
  
  centers1 <- matrix(c(mean(subset(input,
                                   party == names(sort(table(input$party),
                                                       decreasing = TRUE)[1]))[,3]),
                       mean(subset(input,
                                   party == names(sort(table(input$party),
                                                       decreasing = TRUE)[2]))[,3])),
                     ncol = 1, byrow = TRUE)
  centers2 <- matrix(c(mean(subset(input,
                                   party == names(sort(table(input$party),
                                                       decreasing = TRUE)[1]))[,3]),
                       mean(subset(input,
                                   party == names(sort(table(input$party),
                                                       decreasing = TRUE)[1]))[,4]),
                       mean(subset(input,
                                   party == names(sort(table(input$party),
                                                       decreasing = TRUE)[2]))[,3]),
                       mean(subset(input,
                                   party == names(sort(table(input$party),
                                                       decreasing = TRUE)[2]))[,4])),
                     ncol = 2, byrow = TRUE)
  
  distance1 <- matrix(NA, nrow = nrow(input), ncol = nrow(centers1))
  distance2 <- matrix(NA, nrow = nrow(input), ncol = nrow(centers2))
  
  dm1 <- apply(data.frame(1:nrow(centers1)), 1, function(x) {
    replace(distance1[,x], 1:nrow(input), centers1[x,] - input[,3])
  })
  dm2 <- apply(data.frame(1:nrow(centers2)), 1, function(x) {
    replace(distance2[,x], 1:nrow(input), distGeo(centers2[x,], input[,3:4]))
  })
  
  input$party1 <- mutate(as.data.frame(dm1), party1 = ifelse(abs(V1) < abs(V2), 1, 2)) %>%
    select(party1) %>%
    unlist()
  input$party2 <- apply(dm2, 1, which.min)
  
  input <- select(input, -party)
  
  input$congress <- t
  
  nom_senate <- rbind(nom_senate, input)
}

nom_house <- nom_house[-1,]
nom_senate <- nom_senate[-1,]

#add congresses to party unity data frame and restrict to full parties after 1975

brookings <- filter(brookings, Year >= 1975, Party != "Southern Democrats") %>%
  rename(chamber = Chamber) %>%
  mutate(Score = as.numeric(Score),
         chamber = ifelse(chamber == "House", "House of Representatives", chamber)) %>%
  add_column(congress = rep(rep(94:114, each = 2), 4), .after = ncol(.)) %>%
  group_by(chamber, congress) %>%
  summarise(unity = mean(Score, na.rm = TRUE)) %>%
  ungroup()

#prepare Twitter data for merge

germany <- mutate(germany, Country = "Germany")
italy <- mutate(italy, Country = "Italy")
netherlands <- mutate(netherlands, Country = "Netherlands")
spain <- mutate(spain, Country = "Spain")
UK <- mutate(UK, Country = "United Kingdom")
US <- mutate(US, Country = "United States", alpha.sd = NA)

#merge Twitter data

twitter <- rbind(germany, italy, netherlands, spain, UK, US)

#select relevant VDem variables and limit to countries in Twitter data

vdem <- dplyr::select(vdem, country = country_name, year, party = v2paid, vote = v2pavote,
                      lr = v2pariglef_osp, lr_sd = v2pariglef_osp_sd) %>%
  mutate(country = ifelse(country == "United States of America", "United States", country),
         vote = 0.01*vote,
         cy = paste0(country, year)) %>%
  filter(cy %in% c("Germany2013", "Italy2013", "Netherlands2012", "Spain2011", "United Kingdom2015",
                   "United States2012")) %>%
  dplyr::select(-cy)

###################################################################################
#analysis of existing literature
###################################################################################

#total number of articles

nrow(lit)

#number of articles in each publication (Table S1)

table(lit$publication)

#count number of articles with no operationalization

sum(lit$none, na.rm = TRUE)

#calculate percentage of articles using each method in each journal

lit_summary <- group_by(lit, publication) %>%
  dplyr::select(-title, -author, -year) %>%
  dplyr::summarise(n = n(),
                   difference = sum(difference_distance, na.rm = TRUE)/n,
                   variance = sum(variance_homogeneity, na.rm = TRUE)/n,
                   bimodality = sum(bimodality, na.rm = TRUE)/n,
                   overlap = sum(overlap, na.rm = TRUE)/n,
                   importance = sum(importance, na.rm = TRUE)/n,
                   party_unity = sum(party_unity, na.rm = TRUE)/n,
                   correlation = sum(correlation, na.rm = TRUE)/n,
                   r_squared = sum(r_squared, na.rm = TRUE)/n,
                   party_votes = sum(party_votes, na.rm = TRUE)/n,
                   seat_proportion = sum(seat_proportion, na.rm = TRUE)/n,
                   regression_coef = sum(regression_coef, na.rm = TRUE)/n,
                   same_party_clerk = sum(same_party_clerk, na.rm = TRUE)/n,
                   share_extreme = sum(share_extreme, na.rm = TRUE)/n,
                   party_control = sum(party_control, na.rm = TRUE)/n,
                   social_distance = sum(social_distance, na.rm = TRUE)/n,
                   coalition_size = sum(coalition_size, na.rm = TRUE)/n,
                   network_separation = sum(network_separation, na.rm = TRUE)/n,
                   outparty_opinion = sum(outparty_opinion, na.rm = TRUE)/n,
                   time = sum(time, na.rm = TRUE)/n,
                   engagement = sum(engagement, na.rm = TRUE)/n,
                   unspecified = sum(unspecified, na.rm = TRUE)/n,
                   none = sum(none, na.rm = TRUE)/n) %>%
  dplyr::select(-n, -none)

#calculate percentage of articles using each method overall

lit_overall <- dplyr::select(lit, -title, -author, -year, -publication) %>%
  dplyr::summarise(publication = "Overall",
                   n = n(),
                   difference = sum(difference_distance, na.rm = TRUE)/n,
                   variance = sum(variance_homogeneity, na.rm = TRUE)/n,
                   bimodality = sum(bimodality, na.rm = TRUE)/n,
                   overlap = sum(overlap, na.rm = TRUE)/n,
                   importance = sum(importance, na.rm = TRUE)/n,
                   party_unity = sum(party_unity, na.rm = TRUE)/n,
                   correlation = sum(correlation, na.rm = TRUE)/n,
                   r_squared = sum(r_squared, na.rm = TRUE)/n,
                   party_votes = sum(party_votes, na.rm = TRUE)/n,
                   seat_proportion = sum(seat_proportion, na.rm = TRUE)/n,
                   regression_coef = sum(regression_coef, na.rm = TRUE)/n,
                   same_party_clerk = sum(same_party_clerk, na.rm = TRUE)/n,
                   share_extreme = sum(share_extreme, na.rm = TRUE)/n,
                   party_control = sum(party_control, na.rm = TRUE)/n,
                   social_distance = sum(social_distance, na.rm = TRUE)/n,
                   coalition_size = sum(coalition_size, na.rm = TRUE)/n,
                   network_separation = sum(network_separation, na.rm = TRUE)/n,
                   outparty_opinion = sum(outparty_opinion, na.rm = TRUE)/n,
                   time = sum(time, na.rm = TRUE)/n,
                   engagement = sum(engagement, na.rm = TRUE)/n,
                   unspecified = sum(unspecified, na.rm = TRUE)/n,
                   none = sum(none, na.rm = TRUE)/n) %>%
  dplyr::select(-n, -none)

#combine data frames, reorder, and round

lit_summary <- as.data.frame(t(rbind(lit_summary[2,], lit_summary[1,], lit_summary[5,],
                                     lit_summary[3,], lit_summary[8,], lit_summary[6,],
                                     lit_summary[4,], lit_summary[7,], lit_overall)))

colnames(lit_summary) <- lit_summary[1,]

lit_summary <- lit_summary[2:nrow(lit_summary),]

lit_summary <- lit_summary[order(lit_summary$Overall, decreasing = TRUE),]

lit_summary <- apply(lit_summary, 2, function(x) round(as.numeric(x), 3))

lit_summary <- cbind(c("Difference", "Variance", "Bimodality", "Overlap", "Correlation",
                       "Social Distance", "Regression Coefficient", "Proportion Extreme", "Time",
                       "Party Unity", "Party Votes", "Divided Government", "Coalition Size",
                       "Importance", "R-Squared", "Seat Proportion", "Same-Party Clerk",
                       "Network Separation", "Outparty Opinion", "Engagement", "Unspecified"),
                     lit_summary)

#print table of proportions (Table S2)

stargazer(lit_summary, summary = FALSE, title = "Proportion of Articles Using Polarization Measures")

###################################################################################
#silhouette scores for Twitter polarization estimates
###################################################################################

#calculate silhouette scores for each country with increasing k
 
K <- rep(seq(from = 1, to = 10, by = 1), 6)
sils <- c()

#calculate silhouette scores for each country with increasing k

for (i in unique(twitter$Country)) {
  kmeans_data <- dplyr::filter(twitter, Country == i)
  
  for (k in 1:10) {
    if(k == 1) {
      sils[length(sils) + 1] <- 0
      print(k)
    }
    
    else {
      kmeans_results <- kmeans(na.omit(kmeans_data$phi), centers = k, nstart = 30)
      kmeans_sils <- silhouette(kmeans_results$cluster, dist = dist(kmeans_data$phi))
      sils[length(sils) + 1] <- mean(kmeans_sils[,3])
      print(k)
    }
  }
  print(i)
}

#create silhouette plots by country

sil <- as.data.frame(matrix(data = c(K, sils), nrow = 60, ncol = 2, byrow = FALSE))
colnames(sil) <- c("K", "Silhouette Score")

sil <- cbind(sil, Country = c(rep(unique(twitter$Country), each = 10)))
sil$Country <- factor(sil$Country, levels = c("Germany", "Italy", "United States", "Netherlands",
                                              "United Kingdom", "Spain"))

#plot silhouette scores (Figure R5)

pdf("figureR5.pdf", width = 9, height = 7)

ggplot(sil, aes(x = K, y = `Silhouette Score`)) +
  geom_point(size = 1) +
  geom_line() +
  facet_wrap(~ Country) +
  scale_x_continuous(breaks = c(2, 4, 6, 8, 10)) +
  labs(x = "Number of Clusters") +
  theme(strip.background = element_rect(color = "white", fill = "white"))

dev.off()

#print full table of silhouette scores (Table R1)

sil <- cbind(sil[,c(1, 3)], round(sil[,2], 3))

print(sil)

###################################################################################
#calculation of Twitter polarization estimates
###################################################################################

#set seed and number of sets

set.seed(999)
sets <- 100

#bootstrap data sets and calculate polarization estimates for each country

registerDoParallel(cores = cores)

germany_boot <- foreach(i = 1:sets) %dorng% {
  data <- sample_n(tbl = germany, size = nrow(germany), replace = TRUE)
  germany_polarization <- summarise(data, CPC = CPC(phi, "kmeans", k = 2, adjust = TRUE, nstart = 30),
                                    CPC_hclust = CPC(phi, "hclust", k = 2, adjust = TRUE),
                                    CPC_diana = CPC(phi, "diana", k = 2, adjust = TRUE),
                                    difference = diff.uni(phi, 1),
                                    variance = var(phi, na.rm = TRUE))
  data$clusters_hclust <- cutree(hclust(dist(data$phi)), k = 2)
  data$clusters_diana <- cutree(diana(dist(data$phi)), k = 2)
  germany_polarization$difference_hclust <- diff_multidim(data, cols = 2, clusters = 7)
  germany_polarization$difference_diana <- diff_multidim(data, cols = 2, clusters = 8)
  return(germany_polarization)
}

italy_boot <- foreach(i = 1:sets) %dorng% {
  data <- sample_n(tbl = italy, size = nrow(italy), replace = TRUE)
  italy_polarization <- summarise(data, CPC = CPC(phi, "kmeans", k = 2, adjust = TRUE, nstart = 30),
                                  CPC_hclust = CPC(phi, "hclust", k = 2, adjust = TRUE),
                                  CPC_diana = CPC(phi, "diana", k = 2, adjust = TRUE),
                                  difference = diff.uni(phi, 1),
                                  variance = var(phi, na.rm = TRUE))
  data$clusters_hclust <- cutree(hclust(dist(data$phi)), k = 2)
  data$clusters_diana <- cutree(diana(dist(data$phi)), k = 2)
  italy_polarization$difference_hclust <- diff_multidim(data, cols = 2, clusters = 7)
  italy_polarization$difference_diana <- diff_multidim(data, cols = 2, clusters = 8)
  return(italy_polarization)
}

netherlands_boot <- foreach(i = 1:sets) %dorng% {
  data <- sample_n(tbl = netherlands, size = nrow(netherlands), replace = TRUE)
  netherlands_polarization <- summarise(data, CPC = CPC(phi, "kmeans", k = 3, adjust = TRUE,
                                                        nstart = 30),
                                        CPC_hclust = CPC(phi, "hclust", k = 3, adjust = TRUE),
                                        CPC_diana = CPC(phi, "diana", k = 3, adjust = TRUE),
                                        difference = diff.uni(phi, 1),
                                        variance = var(phi, na.rm = TRUE))
  data$clusters_hclust <- cutree(hclust(dist(data$phi)), k = 3)
  data$clusters_diana <- cutree(diana(dist(data$phi)), k = 3)
  netherlands_polarization$difference_hclust <- diff_multidim(data, cols = 2, clusters = 7)
  netherlands_polarization$difference_diana <- diff_multidim(data, cols = 2, clusters = 8)
  return(netherlands_polarization)
}

spain_boot <- foreach(i = 1:sets) %dorng% {
  data <- sample_n(tbl = spain, size = nrow(spain), replace = TRUE)
  spain_polarization <- summarise(data, CPC = CPC(phi, "kmeans", k = 3, adjust = TRUE, nstart = 30),
                                  CPC_hclust = CPC(phi, "hclust", k = 3, adjust = TRUE),
                                  CPC_diana = CPC(phi, "diana", k = 3, adjust = TRUE),
                                  difference = diff.uni(phi, 1),
                                  variance = var(phi, na.rm = TRUE))
  data$clusters_hclust <- cutree(hclust(dist(data$phi)), k = 3)
  data$clusters_diana <- cutree(diana(dist(data$phi)), k = 3)
  spain_polarization$difference_hclust <- diff_multidim(data, cols = 2, clusters = 7)
  spain_polarization$difference_diana <- diff_multidim(data, cols = 2, clusters = 8)
  return(spain_polarization)
}

UK_boot <- foreach(i = 1:sets) %dorng% {
  data <- sample_n(tbl = UK, size = nrow(UK), replace = TRUE)
  UK_polarization <- summarise(data, CPC = CPC(phi, "kmeans", k = 2, adjust = TRUE, nstart = 30),
                               CPC_hclust = CPC(phi, "hclust", k = 2, adjust = TRUE),
                               CPC_diana = CPC(phi, "diana", k = 2, adjust = TRUE),
                               difference = diff.uni(phi, 1),
                               variance = var(phi, na.rm = TRUE))
  data$clusters_hclust <- cutree(hclust(dist(data$phi)), k = 2)
  data$clusters_diana <- cutree(diana(dist(data$phi)), k = 2)
  UK_polarization$difference_hclust <- diff_multidim(data, cols = 2, clusters = 7)
  UK_polarization$difference_diana <- diff_multidim(data, cols = 2, clusters = 8)
  return(UK_polarization)
}

US_boot <- foreach(i = 1:sets) %dorng% {
  data <- sample_n(tbl = US, size = nrow(US), replace = TRUE)
  US_polarization <- summarise(data, CPC = CPC(phi, "kmeans", k = 2, adjust = TRUE, nstart = 30),
                               CPC_hclust = CPC(phi, "hclust", k = 2, adjust = TRUE),
                               CPC_diana = CPC(phi, "diana", k = 2, adjust = TRUE),
                               difference = diff.uni(phi, 1),
                               variance = var(phi, na.rm = TRUE))
  data$clusters_hclust <- cutree(hclust(dist(data$phi)), k = 2)
  data$clusters_diana <- cutree(diana(dist(data$phi)), k = 2)
  US_polarization$difference_hclust <- diff_multidim(data, cols = 2, clusters = 7)
  US_polarization$difference_diana <- diff_multidim(data, cols = 2, clusters = 8)
  return(US_polarization)
}

stopImplicitCluster()

#collapse lists into single data frames and add country names

germany_collapse <- Reduce(function(x, y) bind_rows(x, y), germany_boot) %>%
  mutate(Country = "Germany")

italy_collapse <- Reduce(function(x, y) bind_rows(x, y), italy_boot) %>%
  mutate(Country = "Italy")

netherlands_collapse <- Reduce(function(x, y) bind_rows(x, y), netherlands_boot) %>%
  mutate(Country = "Netherlands")

spain_collapse <- Reduce(function(x, y) bind_rows(x, y), spain_boot) %>%
  mutate(Country = "Spain")

UK_collapse <- Reduce(function(x, y) bind_rows(x, y), UK_boot) %>%
  mutate(Country = "United Kingdom")

US_collapse <- Reduce(function(x, y) bind_rows(x, y), US_boot) %>%
  mutate(Country = "United States")

#combine estimates into single data frame, calculate mean and standard error of bootstrapped polarization estimates, and pivot data frame for plotting

twitter_polarization <- rbind(germany_collapse, italy_collapse, netherlands_collapse, spain_collapse,
                              UK_collapse, US_collapse) %>%
  group_by(Country) %>%
  summarise(CPC_mean = mean(CPC, na.rm = TRUE),
            CPC_se = sd(CPC, na.rm = TRUE),
            CPC_hclust_mean = mean(CPC_hclust, na.rm = TRUE),
            CPC_hclust_se = sd(CPC_hclust, na.rm = TRUE),
            CPC_diana_mean = mean(CPC_diana, na.rm = TRUE),
            CPC_diana_se = sd(CPC_diana, na.rm = TRUE),
            difference_mean = mean(difference, na.rm = TRUE),
            difference_se = sd(difference, na.rm = TRUE),
            difference_hclust_mean = mean(difference_hclust, na.rm = TRUE),
            difference_hclust_se = sd(difference_hclust, na.rm = TRUE),
            difference_diana_mean = mean(difference_diana, na.rm = TRUE),
            difference_diana_se = sd(difference_diana, na.rm = TRUE),
            variance_mean = mean(variance, na.rm = TRUE),
            variance_se = sd(variance, na.rm = TRUE)) %>%
  pivot_longer(cols = c(2, 4, 6, 8, 10, 12, 14), names_to = "Method", values_to = "mean") %>%
  pivot_longer(cols = 2:8, names_to = "method", values_to = "se") %>%
  mutate(Method = ifelse(Method == "CPC_mean", "CPC",
                         ifelse(Method == "CPC_hclust_mean", "CPC (Agglomerative)",
                                ifelse(Method == "CPC_diana_mean", "CPC (Divisive)",
                                       ifelse(Method == "difference_mean", "Difference",
                                              ifelse(Method == "difference_hclust_mean",
                                                     "Difference (Agglomerative)",
                                                     ifelse(Method == "difference_diana_mean",
                                                            "Difference (Divisive)",
                                                            ifelse(Method == "variance_mean",
                                                                   "Variance", NA))))))),
         method = ifelse(method == "CPC_se", "CPC",
                         ifelse(method == "CPC_hclust_se", "CPC (Agglomerative)",
                                ifelse(method == "CPC_diana_se", "CPC (Divisive)",
                                       ifelse(method == "difference_se", "Difference",
                                              ifelse(method == "difference_hclust_se",
                                                     "Difference (Agglomerative)",
                                                     ifelse(method == "difference_diana_se",
                                                            "Difference (Divisive)",
                                                            ifelse(method == "variance_se",
                                                                   "Variance", NA)))))))) %>%
  filter(Method == method) %>%
  dplyr::select(-method)

#change levels for plotting

twitter$Country <- factor(twitter$Country,
                          levels = c("Italy", "Germany", "United States", "United Kingdom",
                                     "Netherlands", "Spain"))

twitter_polarization$Country <- factor(twitter_polarization$Country,
                                       levels = c("Italy", "Germany", "United States",
                                                  "United Kingdom", "Netherlands", "Spain"))

###################################################################################
#VDem party system polarization estimates
###################################################################################

#calculate polarization estimates in VDem data

vdem_system <- group_by(vdem, country) %>%
  summarise(lr_avg = mean(lr, na.rm = TRUE))

vdem_polarization <- full_join(vdem, vdem_system) %>%
  group_by(country) %>%
  mutate(score = vote*((lr - lr_avg)/3)^2) %>%
  summarise(polar = sqrt(sum(score, na.rm = TRUE))) %>%
  ungroup() %>%
  mutate(Method = "Expert-Coded",
         se = NA) %>%
  rename(Country = country, mean = polar)

#row bind into data frame for plotting

vdem_polarization <- vdem_polarization[,c(1, 3, 2, 4)]

twitter_polarization <- rbind(twitter_polarization, vdem_polarization)

###################################################################################
#plots and tables of Twitter polarization estimates
###################################################################################

#relevel variable for plotting

twitter_polarization$Method <- factor(twitter_polarization$Method,
                                      levels = c("Expert-Coded", "CPC", "CPC (Agglomerative)",
                                                 "CPC (Divisive)", "Difference",
                                                 "Difference (Agglomerative)",
                                                 "Difference (Divisive)", "Variance"))

#plot Twitter density plots

pdf("figure3A.pdf", width = 9, height = 7)

ggplot(twitter, aes(x = phi)) +
  geom_density(size = 1) +
  facet_wrap(~ Country, scales = "free_y") +
  labs(x = "Latent Ideology Estimate", y = "Density",
       title = "A: Kernel Density Plots of Ideal Points") +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -0.4))

dev.off()

#plot Twitter polarization estimates with confidence intervals

pdf("figure3B.pdf", width = 9, height = 7)

ggplot(subset(twitter_polarization, Method %in% c("CPC", "Difference", "Variance", "Expert-Coded")),
       aes(x = mean, y = Country)) +
  geom_point(stat = "identity", size = 3) +
  geom_errorbarh(aes(xmin = mean - 1.96*se, xmax = mean + 1.96*se), height = 0.2) +
  facet_wrap(~ Method, scales = "free_x") +
  labs(x = "Level of Polarization", y = "Country", title = "B: Ideological Polarization Estimates") +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1.3))

dev.off()

#plot Twitter polarization estimates by clustering method (Figure R6)

twitter_polarization <- mutate(twitter_polarization,
                               Method = as.character(Method),
                               Method = ifelse(Method == "CPC", "CPC (K-Means)",
                                               ifelse(Method == "Difference", "Difference (K-Means)",
                                                      Method)))

twitter_polarization$Country <- factor(twitter_polarization$Country,
                                       levels = c("Germany", "Italy", "United States", "Netherlands",
                                                  "United Kingdom", "Spain"))

twitter_polarization$Method <- factor(twitter_polarization$Method,
                                      levels = c("CPC (K-Means)", "CPC (Agglomerative)",
                                                 "CPC (Divisive)", "Expert-Coded",
                                                 "Difference (K-Means)", "Difference (Agglomerative)",
                                                 "Difference (Divisive)", "Variance"))

pdf("figureR6.pdf", width = 9, height = 7)

ggplot(subset(twitter_polarization, !Method %in% c("Expert-Coded", "Variance")),
       aes(x = mean, y = Country)) +
  geom_point(stat = "identity", size = 2) +
  geom_errorbarh(aes(xmin = mean - 1.96*se, xmax = mean + 1.96*se), height = 0.2) +
  facet_wrap(~ Method, scales = "free_x") +
  labs(x = "Level of Polarization", y = "Country") +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        strip.text.x = element_text(size = 12),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 16))

dev.off()

#print table of estimates and standard errors (Table R2)

twitter_polarization <- cbind(twitter_polarization[,1:2], round(twitter_polarization[,3:4], 3))

print(twitter_polarization[,c("Country", "Method", "mean", "se")], n = 24, na.print = "")

###################################################################################
#calculation of CSES affective polarization estimates
###################################################################################

#set seed and number of sets and open parallel session

set.seed(456)
sets <- 100

registerDoParallel(cores = cores)

#bootstrap data sets, calculate polarization estimates for each chamber using four measures, and scale measures for presentation on same plot

cses_boot <- foreach(i = 1:sets) %dorng% {
  polar <- as.data.frame(matrix(c(unique(cses$cy), rep(NA, length(unique(cses$cy))*3)), ncol = 4,
                                byrow = FALSE))
  colnames(polar) <- c("cy", "Difference", "Variance", "CPC")
  
  for (j in unique(cses$cy)) {
    data <- filter(cses, cy == j) %>%
      ungroup() %>%
      sample_n(size = nrow(.), replace = TRUE) %>%
      dplyr::select(party, starts_with("affect"))
    
    data <- data[colSums(!is.na(data)) > 0]
    
    diff <- diff_multidim(na.omit(data), cols = 2:ncol(data), clusters = 1)
    var <- tr(cov(na.omit(data[,2:ncol(data)])))
    CPC <- CPC(na.omit(data), "manual", cols = 2:ncol(data), clusters = 1, adjust = TRUE)
    
    polar[polar$cy == j,] <- c(j, diff, var, CPC)
  }
  
  polar <- mutate(polar, country = substr(cy, 1, nchar(cy) - 4),
                  year = as.numeric(substr(cy, nchar(cy) - 3, nchar(cy)))) %>%
    dplyr::select(-cy) %>%
    filter(!is.na(diff), !is.na(var), !is.na(CPC))
  
  return(polar)
}

stopImplicitCluster()

#collapse list into single data frame

cses_polar <- Reduce(function(x, y) bind_rows(x, y), cses_boot)

#calculate mean and standard error of bootstrapped polarization estimates

cses_polar <- pivot_longer(cses_polar, cols = 1:3, names_to = "Measure", values_to = "polar") %>%
  group_by(country, year, Measure) %>%
  summarise(mean = mean(as.numeric(polar), na.rm = TRUE),
            se = sd(as.numeric(polar), na.rm = TRUE),
            low = mean - se*1.96,
            hi = mean + se*1.96) %>%
  ungroup()

#estimate bootstrapped correlations and associated standard errors

cses_cors <- group_by(cses, country, year) %>%
  summarise(efficacy1 = sum(efficacy1 %in% c(4, 5), na.rm = TRUE)/n(),
            efficacy2 = sum(efficacy2 %in% c(4, 5), na.rm = TRUE)/n(),
            close = sum(close == 3, na.rm = TRUE)/n(),
            extreme = sum(extreme %in% c(4, 5), na.rm = TRUE)/n()) %>%
  mutate(efficacy1 = ifelse(country == "Ireland", NA, efficacy1),
         close = ifelse(country %in% c("Kenya", "Ireland"), NA, close))

cses_boot_cors <- lapply(cses_boot, function (x) {
  left_join(x, cses_cors, by = c("country", "year"))
})

cses_polar_cors <- Reduce(function(x, y) bind_rows(x, y), cses_boot_cors)

cses_polar_cors$iter <- rep(seq(1:length(unique(cses$country))), each = sets)

cses_polar_cors <- pivot_longer(cses_polar_cors, cols = 1:3, names_to = "Measure",
                                values_to = "polar") %>%
  pivot_longer(cols = 3:6, names_to = "var", values_to = "value") %>%
  group_by(iter, Measure, var) %>%
  summarise(cor = cor(as.numeric(polar), as.numeric(value), use = "complete.obs")) %>%
  ungroup() %>%
  group_by(Measure, var) %>%
  summarise(mean = mean(cor),
            se = sd(cor),
            low = mean - se*1.96,
            hi = mean + se *1.96) %>%
  mutate(var = ifelse(var == "efficacy1", "Who in Power\nMakes Difference",
                      ifelse(var == "efficacy2", "Vote Makes\nDifference",
                             ifelse(var == "close", "Very Close\nto Party",
                                    ifelse(var == "extreme", "Ideological\nExtremity", NA)))))

###################################################################################
#plots of CSES polarization estimates and correlations
###################################################################################

#relevel to order countries by CPC estimate

cses_polar$country <- factor(cses_polar$country,
                             levels = subset(cses_polar[order(cses_polar$mean),],
                                             Measure == "CPC")$country)

#change base size

theme_set(theme_bw(base_size = 20))

#plot affective polarization estimates (Figure R7)

pdf("figureR7.pdf", width = 9, height = 7)

ggplot(cses_polar, aes(x = mean, y = country)) +
  geom_point(position = position_dodge(width = 0.2)) +
  geom_errorbarh(aes(xmin = low, xmax = hi), height = 0.5, position = position_dodge(width = 0.2)) +
  labs(x = "Level of Polarization", y = "Country") +
  facet_wrap(~ Measure, scales = "free_x") +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16))

dev.off()

#print table of estimates and standard errors (Table R3)

cses_polar_table <- cbind(cses_polar[,c(1, 3)], round(cses_polar[,c(4, 5)], 3))

print(cses_polar_table)

#relevel variables for plotting

cses_polar_cors$Measure <- factor(cses_polar_cors$Measure,
                                  levels = c("CPC", "Difference", "Variance"))
cses_polar_cors$var <- factor(cses_polar_cors$var,
                              levels = c("Ideological\nExtremity", "Very Close\nto Party",
                                         "Vote Makes\nDifference", "Who in Power\nMakes Difference"))

#plot correlations between affective polarization and other variables (Figure 4)

pdf("figure4.pdf", width = 9, height = 7)

ggplot(cses_polar_cors, aes(x = mean, y = var)) +
  geom_point(size = 3) +
  geom_errorbarh(aes(xmin = low, xmax = hi), height = 0.1) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  facet_wrap(~ Measure, nrow = 1) +
  labs(x = "Correlation with Affective Polarization") +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text.x = element_text(size = 16),
        axis.text.y = element_text(size = 16),
        axis.title.y = element_blank(),
        legend.position = "bottom")

dev.off()

#create table of correlations and standard errors

cses_cortable_mean <- cses_polar_cors[cses_polar_cors$Measure %in% c("Difference", "Variance", "CPC"),
                                      c("var", "Measure", "mean")] %>%
  pivot_wider(names_from = "Measure", values_from = "mean") %>%
  mutate(metric = "mean")

cses_cortable_se <- cses_polar_cors[cses_polar_cors$Measure %in% c("Difference", "Variance", "CPC"),
                                    c("var", "Measure", "se")] %>%
  pivot_wider(names_from = "Measure", values_from = "se") %>%
  mutate(metric = "se")

cses_cortable <- rbind(cses_cortable_mean, cses_cortable_se)
cses_cortable <- cses_cortable[order(cses_cortable$var), c(1, 3, 4, 2, 5)]

cses_cortable$Difference <- round(cses_cortable$Difference, 3)
cses_cortable$Variance <- round(cses_cortable$Variance, 3)
cses_cortable$CPC <- round(cses_cortable$CPC, 3)

#print table of correlations and standard errors (Table R4)

cses_cortable <- cbind(cses_cortable[,1], round(cses_cortable[,2:4], 3), cses_cortable[,5])

print(cses_cortable)

###################################################################################
#investigation of feature weight on CPC with CSES data
###################################################################################

#use CSES data as starting point (Canada, Norway, Bulgaria)

cses_low <- filter(cses, country == "Canada") %>%
  ungroup() %>%
  sample_n(size = nrow(.), replace = TRUE) %>%
  dplyr::select(party, starts_with("affect"))

cses_med <- filter(cses, country == "Norway") %>%
  ungroup() %>%
  sample_n(size = nrow(.), replace = TRUE) %>%
  dplyr::select(party, starts_with("affect"))

cses_hi <- filter(cses, country == "Bulgaria") %>%
  ungroup() %>%
  sample_n(size = nrow(.), replace = TRUE) %>%
  dplyr::select(party, starts_with("affect"))

cses_low <- cses_low[colSums(!is.na(cses_low)) > 0]
cses_med <- cses_med[colSums(!is.na(cses_med)) > 0]
cses_hi <- cses_hi[colSums(!is.na(cses_hi)) > 0]

#find baseline BSS and TWSS levels

cses_low_CPC <- CPC(na.omit(cses_low), "manual", cols = 2:ncol(cses_low), clusters = 1, model = TRUE)
cses_med_CPC <- CPC(na.omit(cses_med), "manual", cols = 2:ncol(cses_med), clusters = 1, model = TRUE)
cses_hi_CPC <- CPC(na.omit(cses_hi), "manual", cols = 2:ncol(cses_hi), clusters = 1, model = TRUE)

#calculate CPC with varying BSS, holding TWSS constant

set.seed(999)

polar_low_BSS <- as.data.frame(matrix(NA, ncol = 4, nrow = 1000))
polar_med_BSS <- as.data.frame(matrix(NA, ncol = 4, nrow = 1000))
polar_hi_BSS <- as.data.frame(matrix(NA, ncol = 4, nrow = 1000))

polar_low_BSS[,1] <- runif(1000, cses_low_CPC$BSS/2, cses_low_CPC$BSS*1.5)
polar_low_BSS[,2] <- runif(1000, cses_low_CPC$TWSS/2, cses_low_CPC$TWSS*1.5)
polar_low_BSS[,3] <- polar_low_BSS[,1] + polar_low_BSS[,2]
polar_low_BSS[,4] <- polar_low_BSS[,1]/polar_low_BSS[,3]
polar_low_BSS[,1] <- scale(polar_low_BSS[,1])[,1]
polar_low_BSS[,2] <- scale(polar_low_BSS[,2])[,1]
colnames(polar_low_BSS) <- c("BSS", "WSS", "TSS", "CPC")

polar_med_BSS[,1] <- runif(1000, cses_med_CPC$BSS/2, cses_med_CPC$BSS*1.5)
polar_med_BSS[,2] <- runif(1000, cses_med_CPC$TWSS/2, cses_med_CPC$TWSS*1.5)
polar_med_BSS[,3] <- polar_med_BSS[,1] + polar_med_BSS[,2]
polar_med_BSS[,4] <- polar_med_BSS[,1]/polar_med_BSS[,3]
polar_med_BSS[,1] <- scale(polar_med_BSS[,1])[,1]
polar_med_BSS[,2] <- scale(polar_med_BSS[,2])[,1]
colnames(polar_med_BSS) <- c("BSS", "WSS", "TSS", "CPC")

polar_hi_BSS[,1] <- runif(1000, cses_hi_CPC$BSS/2, cses_hi_CPC$BSS*1.5)
polar_hi_BSS[,2] <- runif(1000, cses_hi_CPC$TWSS/2, cses_hi_CPC$TWSS*1.5)
polar_hi_BSS[,3] <- polar_hi_BSS[,1] + polar_hi_BSS[,2]
polar_hi_BSS[,4] <- polar_hi_BSS[,1]/polar_hi_BSS[,3]
polar_hi_BSS[,1] <- scale(polar_hi_BSS[,1])[,1]
polar_hi_BSS[,2] <- scale(polar_hi_BSS[,2])[,1]
colnames(polar_hi_BSS) <- c("BSS", "WSS", "TSS", "CPC")

#fit OLS models

cses_low_lm <- lm(CPC ~ BSS + WSS, data = polar_low_BSS)

cses_med_lm <- lm(CPC ~ BSS + WSS, data = polar_med_BSS)

cses_hi_lm <- lm(CPC ~ BSS + WSS, data = polar_hi_BSS)

#print table of OLS models (Table S5)

stargazer(cses_low_lm, cses_med_lm, cses_hi_lm,
          title = "Effect of Individual Features on CPC Estimates by Level of Polarization. $BSS$ and $WSS$ unit-normalized. Standard errors in parentheses.",
          column.labels = c("Low Polarization", "Medium Polarization", "High Polarization"),
          omit.stat = c("f"),
          star.char = c("*"), star.cutoffs = c(0.05))

#reshape and combine data for plotting

polar_low_BSS <- pivot_longer(polar_low_BSS, cols = 1:2, names_to = "Feature",
                              values_to = "Value") %>%
  mutate(level = "Low Polarization")

polar_med_BSS <- pivot_longer(polar_med_BSS, cols = 1:2, names_to = "Feature",
                              values_to = "Value") %>%
  mutate(level = "Medium Polarization")

polar_hi_BSS <- pivot_longer(polar_hi_BSS, cols = 1:2, names_to = "Feature", values_to = "Value") %>%
  mutate(level = "High Polarization")

polar_BSS <- rbind(polar_low_BSS, polar_med_BSS, polar_hi_BSS)

#relevel for plotting

polar_BSS$level <- factor(polar_BSS$level, levels = c("Low Polarization", "Medium Polarization",
                                                      "High Polarization"))

#plot descriptive plot (Figure S8)

pdf("figureS8.pdf", width = 9, height = 7)

grid.draw(shift_legend(ggplot(polar_BSS,
                              aes(x = Value, y = CPC, shape = Feature, linetype = Feature)) +
                         geom_smooth(alpha = 0.3, color = "black", method = "loess") +
                         facet_wrap(~ level, ncol = 2, scales = "free_y") +
                         scale_shape_manual(values = c(1, 4)) +
                         labs(x = "Value of Feature") +
                         theme(strip.background = element_rect(color = "white", fill = "white"),
                               axis.text = element_text(size = 16))))

dev.off()

#draw added-variable plots (Figure S9)

pdf("figureS9A.pdf", width = 9, height = 7)

avPlots(cses_low_lm, pch = 20, cex = 0.5, cex.lab = 1.5, cex.axis = 1.25, id = FALSE, col = "gray",
        col.lines = "black", ylab = "CPC", main = "")

dev.off()

pdf("figureS9B.pdf", width = 9, height = 7)

avPlots(cses_med_lm, pch = 20, cex = 0.5, cex.lab = 1.5, cex.axis = 1.25, id = FALSE, col = "gray",
        col.lines = "black", ylab = "CPC", main = "")

dev.off()

pdf("figureS9C.pdf", width = 9, height = 7)

avPlots(cses_hi_lm, pch = 20, cex = 0.5, cex.lab = 1.5, cex.axis = 1.25, id = FALSE, col = "gray",
        col.lines = "black", ylab = "CPC", main = "")

dev.off()

###################################################################################
#calculation of DW-NOMINATE polarization estimates (first dimension)
###################################################################################

#set seed and number of sets and open parallel session

set.seed(111)
sets <- 100

registerDoParallel(cores = cores)

#bootstrap data sets, calculate polarization estimates for each chamber using four measures, and scale measures for presentation on same plot

house_boot <- foreach(i = 1:sets) %dorng% {
  data <- sample_n(tbl = nom_house, size = nrow(nom_house), replace = TRUE)
  
  nom_diff <- c()
  nom_var <- c()
  nom_CPC <- c()
  
  for (t in sort(unique(data$congress))) {
    data_temp <- filter(data, congress == t)
    
    nom_diff <- c(nom_diff, diff_multidim(data_temp, cols = 3, clusters = 7))
    nom_var <- c(nom_var, var(data_temp$nominate_dim1, na.rm = TRUE))
    nom_CPC <- c(nom_CPC, CPC(data_temp, "manual", cols = 3, clusters = 7, adjust = TRUE))
  }
  
  house_polarization <- matrix(c(sort(unique(data$congress)), nom_diff, nom_var, nom_CPC), ncol = 4,
                               byrow = FALSE)
  colnames(house_polarization) <- c("congress", "nom_diff", "nom_var", "nom_CPC")
  
  house_polarization <- mutate(as.data.frame(house_polarization), nom_diff_scale = scale(nom_diff),
                               nom_var_scale = scale(nom_var), nom_CPC_scale = scale(nom_CPC))
  
  return(house_polarization)
}

senate_boot <- foreach(i = 1:sets) %dorng% {
  data <- sample_n(tbl = nom_senate, size = nrow(nom_senate), replace = TRUE)
  
  nom_diff <- c()
  nom_var <- c()
  nom_CPC <- c()
  
  for (t in sort(unique(data$congress))) {
    data_temp <- filter(data, congress == t)
    
    nom_diff <- c(nom_diff, diff_multidim(data_temp, cols = 3, clusters = 7))
    nom_var <- c(nom_var, var(data_temp$nominate_dim1, na.rm = TRUE))
    nom_CPC <- c(nom_CPC, CPC(data_temp, "manual", cols = 3, clusters = 7, adjust = TRUE))
  }
  
  senate_polarization <- matrix(c(sort(unique(data$congress)), nom_diff, nom_var, nom_CPC), ncol = 4,
                                byrow = FALSE)
  colnames(senate_polarization) <- c("congress", "nom_diff", "nom_var", "nom_CPC")
  
  senate_polarization <- mutate(as.data.frame(senate_polarization), nom_diff_scale = scale(nom_diff),
                                nom_var_scale = scale(nom_var), nom_CPC_scale = scale(nom_CPC))
  
  return(senate_polarization)
}

stopImplicitCluster()

#collapse lists into single data frames

house_collapse <- Reduce(function(x, y) bind_rows(x, y), house_boot)
senate_collapse <- Reduce(function(x, y) bind_rows(x, y), senate_boot)

#calculate mean and standard error of bootstrapped polarization estimates

house_polarization <- group_by(house_collapse, congress) %>%
  dplyr::summarize(nom_diff_mean = mean(nom_diff, na.rm = TRUE),
                   nom_var_mean = mean(nom_var, na.rm = TRUE),
                   nom_CPC_mean = mean(nom_CPC, na.rm = TRUE),
                   nom_diff_scale_mean = mean(nom_diff_scale, na.rm = TRUE),
                   nom_var_scale_mean = mean(nom_var_scale, na.rm = TRUE),
                   nom_CPC_scale_mean = mean(nom_CPC_scale, na.rm = TRUE)) %>%
  mutate(chamber = "House of Representatives")

senate_polarization <- group_by(senate_collapse, congress) %>%
  dplyr::summarize(nom_diff_mean = mean(nom_diff, na.rm = TRUE),
                   nom_var_mean = mean(nom_var, na.rm = TRUE),
                   nom_CPC_mean = mean(nom_CPC, na.rm = TRUE),
                   nom_diff_scale_mean = mean(nom_diff_scale, na.rm = TRUE),
                   nom_var_scale_mean = mean(nom_var_scale, na.rm = TRUE),
                   nom_CPC_scale_mean = mean(nom_CPC_scale, na.rm = TRUE)) %>%
  mutate(chamber = "Senate")

#combine House and Senate polarization estimates into single data frame

nom_polarization <- rbind(house_polarization, senate_polarization) %>%
  mutate(nom_diff_scale_mean = ifelse(is.nan(nom_diff_scale_mean), NA, nom_diff_scale_mean),
         nom_var_scale_mean = ifelse(is.nan(nom_var_scale_mean), NA, nom_var_scale_mean),
         nom_CPC_scale_mean = ifelse(is.nan(nom_CPC_scale_mean), NA, nom_CPC_scale_mean))

#collapse measures into single column and add column denoting estimate type

nom_polarization <- dplyr::select(nom_polarization, congress, chamber, starts_with("nom")) %>%
  dplyr::select(congress, chamber, ends_with("mean")) %>%
  pivot_longer(cols = c("nom_diff_mean", "nom_var_mean", "nom_CPC_mean", "nom_diff_scale_mean",
                        "nom_var_scale_mean", "nom_CPC_scale_mean"),
               names_to = "measure", values_to = "mean") %>%
  mutate(measure = substr(measure, 5, nchar(measure)),
         scaled = ifelse(measure %in% c("diff_scale_mean", "var_scale_mean", "CPC_scale_mean"), 1,
                         0)) %>%
  mutate(measure = ifelse(measure %in% c("diff_mean", "diff_scale_mean"), "Difference",
                          ifelse(measure %in% c("var_mean", "var_scale_mean"), "Variance",
                                 ifelse(measure %in% c("CPC_mean", "CPC_scale_mean"), "CPC", NA))))

#relevel faceting variables

nom_polarization$measure <- factor(nom_polarization$measure,
                                   levels = c("Difference", "Variance", "CPC"))

###################################################################################
#calculation of DW-NOMINATE polarization estimates (both dimensions)
###################################################################################

#set seed and open parallel session

set.seed(456)

registerDoParallel(cores = cores)

#bootstrap data sets, calculate polarization estimates for each chamber using four measures, and scale measures for presentation on same plot

house_boot2 <- foreach(i = 1:sets) %dorng% {
  data <- sample_n(tbl = nom_house, size = nrow(nom_house), replace = TRUE)
  
  nom_diff <- c()
  nom_var <- c()
  nom_CPC <- c()
  
  for (t in sort(unique(data$congress))) {
    data_temp <- filter(data, congress == t)
    
    nom_diff <- c(nom_diff, diff_multidim(data_temp, cols = 3:4, clusters = 8))
    nom_var <- c(nom_var, tr(cov(matrix(c(data_temp$nominate_dim1, data_temp$nominate_dim2),
                                        ncol = 2), use = "complete.obs")))
    nom_CPC <- c(nom_CPC, CPC(data_temp, "manual", cols = 3:4, clusters = 8, adjust = TRUE))
  }
  
  house_polarization <- matrix(c(sort(unique(data$congress)), nom_diff, nom_var, nom_CPC), ncol = 4,
                               byrow = FALSE)
  colnames(house_polarization) <- c("congress", "nom_diff", "nom_var", "nom_CPC")
  
  house_polarization <- mutate(as.data.frame(house_polarization),
                               nom_diff_scale = scale(nom_diff),
                               nom_var_scale = scale(nom_var),
                               nom_CPC_scale = scale(nom_CPC))
  
  return(house_polarization)
}

senate_boot2 <- foreach(i = 1:sets) %dorng% {
  data <- sample_n(tbl = nom_senate, size = nrow(nom_senate), replace = TRUE)
  
  nom_diff <- c()
  nom_var <- c()
  nom_CPC <- c()
  
  for (t in sort(unique(data$congress))) {
    data_temp <- filter(data, congress == t)
    
    nom_diff <- c(nom_diff, diff_multidim(data_temp, cols = 3:4, clusters = 8))
    nom_var <- c(nom_var, tr(cov(matrix(c(data_temp$nominate_dim1, data_temp$nominate_dim2),
                                        ncol = 2), use = "complete.obs")))
    nom_CPC <- c(nom_CPC, CPC(data_temp, "manual", cols = 3:4, clusters = 8, adjust = TRUE))
  }
  
  senate_polarization <- matrix(c(sort(unique(data$congress)), nom_diff, nom_var, nom_CPC), ncol = 4,
                                byrow = FALSE)
  colnames(senate_polarization) <- c("congress", "nom_diff", "nom_var", "nom_CPC")
  
  senate_polarization <- mutate(as.data.frame(senate_polarization),
                                nom_diff_scale = scale(nom_diff),
                                nom_var_scale = scale(nom_var),
                                nom_CPC_scale = scale(nom_CPC))
  
  return(senate_polarization)
}

stopImplicitCluster()

#collapse lists into single data frames

house_collapse2 <- Reduce(function(x, y) bind_rows(x, y), house_boot2)
senate_collapse2 <- Reduce(function(x, y) bind_rows(x, y), senate_boot2)

#calculate mean and standard error of bootstrapped polarization estimates

house_polarization2 <- group_by(house_collapse2, congress) %>%
  dplyr::summarize(nom_diff_mean = mean(nom_diff, na.rm = TRUE),
                   nom_var_mean = mean(nom_var, na.rm = TRUE),
                   nom_CPC_mean = mean(nom_CPC, na.rm = TRUE),
                   nom_diff_scale_mean = mean(nom_diff_scale, na.rm = TRUE),
                   nom_var_scale_mean = mean(nom_var_scale, na.rm = TRUE),
                   nom_CPC_scale_mean = mean(nom_CPC_scale, na.rm = TRUE)) %>%
  mutate(chamber = "House of Representatives")

senate_polarization2 <- group_by(senate_collapse2, congress) %>%
  dplyr::summarize(nom_diff_mean = mean(nom_diff, na.rm = TRUE),
                   nom_var_mean = mean(nom_var, na.rm = TRUE),
                   nom_CPC_mean = mean(nom_CPC, na.rm = TRUE),
                   nom_diff_scale_mean = mean(nom_diff_scale, na.rm = TRUE),
                   nom_var_scale_mean = mean(nom_var_scale, na.rm = TRUE),
                   nom_CPC_scale_mean = mean(nom_CPC_scale, na.rm = TRUE)) %>%
  mutate(chamber = "Senate")

#combine House and Senate polarization estimates into single data frame

nom_polarization2 <- rbind(house_polarization2, senate_polarization2) %>%
  mutate(nom_diff_scale_mean = ifelse(is.nan(nom_diff_scale_mean), NA, nom_diff_scale_mean),
         nom_var_scale_mean = ifelse(is.nan(nom_var_scale_mean), NA, nom_var_scale_mean),
         nom_CPC_scale_mean = ifelse(is.nan(nom_CPC_scale_mean), NA, nom_CPC_scale_mean))

#collapse measures into single column and add column denoting estimate type

nom_polarization2 <- dplyr::select(nom_polarization2, congress, chamber, starts_with("nom")) %>%
  dplyr::select(congress, chamber, ends_with("mean")) %>%
  pivot_longer(cols = c("nom_diff_mean", "nom_var_mean", "nom_CPC_mean", "nom_diff_scale_mean",
                        "nom_var_scale_mean", "nom_CPC_scale_mean"),
               names_to = "measure", values_to = "mean") %>%
  mutate(measure = substr(measure, 5, nchar(measure)),
         scaled = ifelse(measure %in% c("diff_scale_mean", "var_scale_mean", "CPC_scale_mean"), 1,
                         0)) %>%
  mutate(measure = ifelse(measure %in% c("diff_mean", "diff_scale_mean"), "Difference",
                          ifelse(measure %in% c("var_mean", "var_scale_mean"), "Variance",
                                 ifelse(measure %in% c("CPC_mean", "CPC_scale_mean"), "CPC", NA))))

#relevel faceting variables

nom_polarization2$measure <- factor(nom_polarization2$measure,
                                    levels = c("Difference", "Variance", "CPC"))

###################################################################################
#DW-NOMINATE plots with all dimensions
###################################################################################

#combine polarization data frames

nom_polarization$dimension <- "First Dimension Only"
nom_polarization2$dimension <- "Both Dimensions"

nom_polarization_all <- rbind(nom_polarization, nom_polarization2)

nom_polarization_all <- filter(nom_polarization_all, scaled == 1) %>% 
  group_by(dimension, chamber, measure) %>%
  mutate()

#relevel variables

nom_polarization_all$dimension <- factor(nom_polarization_all$dimension,
                                         levels = c("First Dimension Only", "Both Dimensions"))

#add column of years

nom_polarization_all$year <- rep(rep(rep(seq(1789, 2019, by = 2), each = 3), 2), 2)

#change base size

theme_set(theme_bw(base_size = 18))

#plot graph of scaled measures (Figure 2)

pdf("figure2.pdf", width = 9, height = 7)

ggplot(subset(nom_polarization_all, year >= 1975), aes(x = year, y = mean, linetype = measure)) +
  geom_line(size = 0.75) +
  facet_grid(dimension ~ chamber, scales = "free_y") +
  scale_linetype_manual(values = c("dotted", "dashed", "solid")) +
  labs(x = "Year", y = "Polarization Estimates", linetype = "Measure") +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 14),
        legend.position = "bottom",
        legend.key.width = unit(2, "line"))

dev.off()

###################################################################################
#DW-NOMINATE polarization correlations with party unity scores
###################################################################################

#merge unity scores into congressional polarization data frames

brookings_polar <- left_join(nom_polarization, brookings)
brookings_polar2 <- left_join(nom_polarization2, brookings)

brookings_polar <- rbind(brookings_polar, brookings_polar2)

#calculate correlations between congressional polarization and unity scores and vetoes

brookings_cor <- filter(brookings_polar, scaled == 0) %>%
  group_by(chamber, measure, dimension) %>%
  summarise(cor = cor(mean, unity, use = "complete.obs"))

#print table of correlations (Table 1)

print(brookings_cor)

###################################################################################
#simulated distributions for visual comparison
###################################################################################

#generate simulated bimodal distributions

set.seed(456)

variance_values <- c(0.5, 1, 1.5, 2)
means_values <- c(5, 4, 3, 2)
plots <- c()

for (i in variance_values) {
  for (j in means_values) {
    data <- c(rnorm(500, -j, i), rnorm(500, j, i))
    plots <- cbind(plots, data)
    base::print(j)
  }
  base::print(i)
}

#add plot labels and convert to tidy data frame

plots <- as.data.frame(plots)
plots_univariate <- c("5, 0.5", "4, 0.5", "3, 0.5", "2, 0.5", "5, 1", "4, 1", "3, 1", "2, 1",
                      "5, 1.5", "4, 1.5", "3, 1.5", "2, 1.5", "5, 2", "4, 2", "3, 2", "2, 2")
colnames(plots) <- plots_univariate
plots <- pivot_longer(plots, cols = everything()) %>%
  dplyr::arrange(name)

#add plot labels and specify levels for ordering plots

means_labels <- c(rep("means = (-2, 2)", 4000), rep("means = (-3, 3)", 4000),
                  rep("means = (-4, 4)", 4000), rep("means = (-5, 5)", 4000))
variance_labels <- rep(c(rep("sd = 0.5", 1000), rep("sd = 1", 1000), rep("sd = 1.5", 1000),
                         rep("sd = 2", 1000)), 4)
clusters <- rep(rep(c("1", "2"), each = 500), 16)

plots <- cbind(plots, means_labels, variance_labels, clusters)

plots$means_labels <- factor(plots$means_labels, levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                            "means = (-4, 4)", "means = (-5, 5)"))
plots$variance_labels <- factor(plots$variance_labels, levels = c("sd = 2", "sd = 1.5",
                                                                  "sd = 1", "sd = 0.5"))

levels(plots$means_labels) <- c("means = (-2, 2)" = TeX("$\\mu = \\{-2, 2\\}$"),
                                "means = (-3, 3)" = TeX("$\\mu = \\{-3, 3\\}$"),
                                "means = (-4, 4)" = TeX("$\\mu = \\{-4, 4\\}$"),
                                "means = (-5, 5)" = TeX("$\\mu = \\{-5, 5\\}$"))
levels(plots$variance_labels) <- c("sd = 2" = TeX("$\\sigma = 2$"),
                                   "sd = 1.5" = TeX("$\\sigma = 1.5$"),
                                   "sd = 1" = TeX("$\\sigma = 1$"),
                                   "sd = 0.5" = TeX("$\\sigma = 0.5$"))

#plot univariate distributions (Figure S1)

pdf("figureS1.pdf", width = 9, height = 7)

ggplot(plots, aes(x = value)) +
  geom_density(size = 1) +
  facet_grid(variance_labels ~ means_labels, labeller = label_parsed) +
  labs(x = "Value", y = "Density") +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 12))

dev.off()

#reshape data for stylized plots

plots <- filter(plots, means_labels %in% c("means = (-2, 2)", "means = (-5, 5)"),
                variance_labels %in% c("sd = 2", "sd = 0.5")) %>%
  mutate(title = ifelse(name == "2, 2", "Low intergroup heterogeneity,\nlow intragroup homogeneity",
                        ifelse(name == "2, 0.5", "Low intergroup heterogeneity,\nhigh intragroup homogeneity",
                               ifelse(name == "5, 2", "High intergroup heterogeneity,\nlow intragroup homogeneity",
                                      ifelse(name == "5, 0.5", "High intergroup heterogeneity,\nhigh intragroup homogeneity", NA)))))

#calculate polarization estimates

plots_polar <- as.data.frame(matrix(NA, nrow = 4, ncol = 4))
row <- 1

for (i in unique(plots$name)) {
  data <- filter(plots, name == i)
  
  polar_diff <- round(diff_multidim(data, cols = 2, clusters = 5), 3)
  polar_var <- round(var(data$value), 3)
  polar_CPC <- round(CPC(data, "manual", k = 2, cols = 2, clusters = 5), 3)
  
  plots_polar[row,] <- c(i, polar_diff, polar_var, polar_CPC)
  
  row <- row + 1
}

colnames(plots_polar) <- c("name", "diff", "var", "CPC")

#merge polarization estimates into data frame and create labels

plots <- left_join(plots, plots_polar, by = "name") %>%
  mutate(label = paste("Difference = ", diff, "\nVariance = ", var, "\nCPC = ", CPC, sep = ""))

#relevel title variable

plots$title <- factor(plots$title,
                      levels = c("Low intergroup heterogeneity,\nlow intragroup homogeneity",
                                 "Low intergroup heterogeneity,\nhigh intragroup homogeneity",
                                 "High intergroup heterogeneity,\nlow intragroup homogeneity",
                                 "High intergroup heterogeneity,\nhigh intragroup homogeneity"))

#create separate data frame for polarization labels

plots_label <- dplyr::select(plots, title, label) %>%
  distinct()

plots_label$letter <- c("B", "A", "D", "C")

#change base size

theme_set(theme_bw(base_size = 20))

#plot stylized plots (Figure 1)

pdf("figure1.pdf", width = 9, height = 7)

ggplot(plots, aes(x = value)) +
  geom_density(size = 1) +
  geom_text(data = plots_label, aes(x = 12.5, y = 0.25, label = label), hjust = 1, size = 4.3) +
  geom_text(data = plots_label, aes(x = -12.5, y = 0.3, label = letter), hjust = 0, vjust = 1,
            size = 7) +
  facet_wrap(~ title) +
  labs(x = "Value", y = "Density") +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 14))

dev.off()

###################################################################################
#analysis of human labeling
###################################################################################

#reshape data

label_data <- pivot_longer(label_data, cols = contains("_DO"), names_to = "choice_names",
                           values_to = "choices") %>%
  mutate(choice_names = substr(choice_names, 1, nchar(choice_names) - 3)) %>%
  pivot_longer(cols = ends_with("compare"), names_to = "selection_names", values_to = "selection") %>%
  filter(choice_names == selection_names) %>%
  mutate(choices = gsub("[^0-9]", "&", choices)) %>%
  separate(col = choices, into = c("choice1", "choice2"), sep = "&") %>%
  mutate(choice1 = as.numeric(choice1),
         choice2 = as.numeric(choice2),
         choice1_preserve = choice1,
         choice1 = ifelse(choice1 < choice2, choice1, choice2),
         choice2 = ifelse(choice1 == choice2, choice1_preserve, choice2),
         selection = ifelse(selection == choice1, 1, 2)) %>%
  dplyr::select(-choice_names, -selection_names, -choice1_preserve) %>%
  distinct() %>%
  arrange(match(choice2, mixedsort(unique(choice2))), match(choice1, mixedsort(unique(choice1)))) %>%
  mutate(choice = paste(choice1, choice2, sep = "-")) %>%
  dplyr::select(-choice1, -choice2) %>%
  filter(!is.na(selection)) %>%
  pivot_wider(names_from = "choice", values_from = "selection") %>%
  dplyr::select(-ResponseId) %>%
  mutate_all(function(x) na_if(as.character(x), "NULL")) %>%
  filter(if_any(everything(), ~ !is.na(.)))

#create design matrix

design <- llbt.design(label_data, nitems = 50)
object_names <- paste("o", 1:50, sep = "")

#fit model

object_formula <- as.formula(paste("y ~ ", paste(object_names, collapse = "+")))

model <- gnm(object_formula, eliminate = mu, family = poisson, data = design)

#calculate worth and recover human ranking

worth <- llbt.worth(model)

worth <- as.data.frame(cbind(worth, seq(1:50)))
worth <- arrange(worth, desc(worth))
worth <- cbind(worth, seq(1:50))
colnames(worth) <- c("worth", "choice", "rank_human")

#calculate polarization estimates for each distribution and recover rankings

label_diff <- c()
label_var <- c()
label_CPC <- c()

for (i in 1:length(data_list)) {
  data <- data_list[[i]]
  data <- mutate(data, label = ifelse(label == "D", 1, 2))
  
  label_diff[i] <- diff_multidim(data, cols = 2, clusters = 1)
  label_var[i] <- var(data$value)
  label_CPC[i] <- CPC(data, type = "manual", k = 2, adjust = TRUE, cols = 2, clusters = 1)
}

label_diff <- as.data.frame(cbind(label_diff, seq(1:50))) %>%
  arrange(desc(label_diff))
label_var <- as.data.frame(cbind(label_var, seq(1:50))) %>%
  arrange(desc(label_var))
label_CPC <- as.data.frame(cbind(label_CPC, seq(1:50))) %>%
  arrange(desc(label_CPC))

label_diff <- cbind(label_diff, seq(1:50))
label_var <- cbind(label_var, seq(1:50))
label_CPC <- cbind(label_CPC, seq(1:50))

colnames(label_diff) <- c("difference", "choice", "rank_Difference")
colnames(label_var) <- c("variance", "choice", "rank_Variance")
colnames(label_CPC) <- c("CPC", "choice", "rank_CPC")

#merge polarization rankings into data set with human ranking, pivot to longer format, and calculate RMSE and Spearman's r

label_metrics <- full_join(worth, label_diff, by = "choice") %>%
  full_join(., label_var, by = "choice") %>%
  full_join(., label_CPC, by = "choice") %>%
  dplyr::select(-worth, -difference, -variance, -CPC, -choice) %>%
  pivot_longer(cols = 2:4, names_to = "measure", values_to = "rank") %>%
  mutate(measure = substr(measure, 6, nchar(measure))) %>%
  group_by(measure) %>%
  dplyr::summarise(RMSE = round(Metrics::rmse(rank_human, rank), 3),
                   Spearman = round(cor(rank_human, rank, method = "spearman"), 3))

#print table of RMSE and Spearman's r (Table S4)

label_metrics <- cbind(label_metrics[,1], round(label_metrics[,2:3], 3))

print(label_metrics)

###################################################################################
#Simulations for univariate distributions (2 clusters)
###################################################################################

#set seed

set.seed(123)

#generate Gaussian mixture distributions

MC_uni_ranvar <- list()
MC_uni_ranmean <- list()
MC_uni_variances <- c()
MC_uni_means <- c()

for (i in means_values) {
  for (j in 1:1000) {
    variance <- runif(1, min = 0.5, max = 2)
    data <- matrix(c(rnorm(500, -i, variance), rnorm(500, i, variance)), ncol = 1)
    cluster <- matrix(c(rep(1, 500), rep(2, 500)), ncol = 1)
    data <- cbind(data, cluster)
    MC_uni_ranvar[[length(MC_uni_ranvar) + 1]] <- data
    MC_uni_variances[[length(MC_uni_variances) + 1]] <- variance
  }
  base::print(i)
}

for (i in variance_values) {
  for (j in 1:1000) {
    mean <- runif(1, min = 2, max = 5)
    data <- matrix(c(rnorm(500, -mean, i), rnorm(500, mean, i)), ncol = 1)
    cluster <- matrix(c(rep(1, 500), rep(2, 500)), ncol = 1)
    data <- cbind(data, cluster)
    MC_uni_ranmean[[length(MC_uni_ranmean) + 1]] <- data
    MC_uni_means[[length(MC_uni_means) + 1]] <- mean
  }
  base::print(i)
}

#calculate various polarization scores for simulated distributions

MC_uni_ranvar_diff <- pblapply(MC_uni_ranvar,
                               function(x) diff_multidim(x, cols = 1, clusters = 2), cl = cores)
MC_uni_ranvar_var <- pblapply(MC_uni_ranvar, function(x) var(x[,1]), cl = cores)
MC_uni_ranvar_CPC <- pblapply(MC_uni_ranvar,
                              function(x) CPC(x, "manual", cols = 1, clusters = 2, adjust = TRUE),
                              cl = cores)

MC_uni_ranmean_diff <- pblapply(MC_uni_ranmean,
                                function(x) diff_multidim(x, cols = 1, clusters = 2), cl = cores)
MC_uni_ranmean_var <- pblapply(MC_uni_ranmean, function(x) var(x[,1]), cl = cores)
MC_uni_ranmean_CPC <- pblapply(MC_uni_ranmean,
                               function(x) CPC(x, "manual", cols = 1, clusters = 2, adjust = TRUE),
                               cl = cores)

#combine random variance polarization results into data frames, holding means constant within data frame for accurate rank calculations

MC_uni_ranvar1 <- as.data.frame(matrix(data = c(unlist(MC_uni_variances[1:1000]),
                                                unlist(MC_uni_ranvar_diff[1:1000]),
                                                unlist(MC_uni_ranvar_var[1:1000]),
                                                unlist(MC_uni_ranvar_CPC[1:1000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar1) <- c("ranvar", "difference", "variance", "CPC")

MC_uni_ranvar2 <- as.data.frame(matrix(data = c(unlist(MC_uni_variances[1001:2000]),
                                                unlist(MC_uni_ranvar_diff[1001:2000]),
                                                unlist(MC_uni_ranvar_var[1001:2000]),
                                                unlist(MC_uni_ranvar_CPC[1001:2000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar2) <- c("ranvar", "difference", "variance", "CPC")

MC_uni_ranvar3 <- as.data.frame(matrix(data = c(unlist(MC_uni_variances[2001:3000]),
                                                unlist(MC_uni_ranvar_diff[2001:3000]),
                                                unlist(MC_uni_ranvar_var[2001:3000]),
                                                unlist(MC_uni_ranvar_CPC[2001:3000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar3) <- c("ranvar", "difference", "variance", "CPC")

MC_uni_ranvar4 <- as.data.frame(matrix(data = c(unlist(MC_uni_variances[3001:4000]),
                                                unlist(MC_uni_ranvar_diff[3001:4000]),
                                                unlist(MC_uni_ranvar_var[3001:4000]),
                                                unlist(MC_uni_ranvar_CPC[3001:4000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar4) <- c("ranvar", "difference" ,"variance", "CPC")

#combine random mean polarization results into data frames, holding variance constant within data frame for accurate rank calculations

MC_uni_ranmean1 <- as.data.frame(matrix(data = c(unlist(MC_uni_means[1:1000]),
                                                 unlist(MC_uni_ranmean_diff[1:1000]),
                                                 unlist(MC_uni_ranmean_var[1:1000]),
                                                 unlist(MC_uni_ranmean_CPC[1:1000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean1) <- c("ranmean", "difference", "variance", "CPC")

MC_uni_ranmean2 <- as.data.frame(matrix(data = c(unlist(MC_uni_means[1001:2000]),
                                                 unlist(MC_uni_ranmean_diff[1001:2000]),
                                                 unlist(MC_uni_ranmean_var[1001:2000]),
                                                 unlist(MC_uni_ranmean_CPC[1001:2000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean2) <- c("ranmean", "difference", "variance", "CPC")

MC_uni_ranmean3 <- as.data.frame(matrix(data = c(unlist(MC_uni_means[2001:3000]),
                                                 unlist(MC_uni_ranmean_diff[2001:3000]),
                                                 unlist(MC_uni_ranmean_var[2001:3000]),
                                                 unlist(MC_uni_ranmean_CPC[2001:3000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean3) <- c("ranmean", "difference", "variance", "CPC")

MC_uni_ranmean4 <- as.data.frame(matrix(data = c(unlist(MC_uni_means[3001:4000]),
                                                 unlist(MC_uni_ranmean_diff[3001:4000]),
                                                 unlist(MC_uni_ranmean_var[3001:4000]),
                                                 unlist(MC_uni_ranmean_CPC[3001:4000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean4) <- c("ranmean", "difference", "variance", "CPC")

#merge data sets together for plotting

MC_uni_ranvar_all <- rbind(MC_uni_ranvar1, MC_uni_ranvar2, MC_uni_ranvar3, MC_uni_ranvar4)
MC_uni_ranmean_all <- rbind(MC_uni_ranmean1, MC_uni_ranmean2, MC_uni_ranmean3, MC_uni_ranmean4)

#add means and standard deviation labels and scale measures

ranvar_labels <- c(rep(c("means = (-5, 5)", "means = (-4, 4)", "means = (-3, 3)", "means = (-2, 2)"),
                       each = 1000))
ranmean_labels <- c(rep(c("sd = 0.5", "sd = 1", "sd = 1.5", "sd = 2"), each = 1000))

ranvar_labels <- factor(ranvar_labels, levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                            "means = (-4, 4)", "means = (-5, 5)"))
ranmean_labels <- factor(ranmean_labels, levels = c("sd = 2", "sd = 1.5",
                                                                  "sd = 1", "sd = 0.5"))

levels(ranvar_labels) <- c("means = (-2, 2)" = TeX("$\\mu = \\{-2, 2\\}$"),
                           "means = (-3, 3)" = TeX("$\\mu = \\{-3, 3\\}$"),
                           "means = (-4, 4)" = TeX("$\\mu = \\{-4, 4\\}$"),
                           "means = (-5, 5)" = TeX("$\\mu = \\{-5, 5\\}$"))
levels(ranmean_labels) <- c("sd = 2" = TeX("$\\sigma = 2$"),
                            "sd = 1.5" = TeX("$\\sigma = 1.5$"),
                            "sd = 1" = TeX("$\\sigma = 1$"),
                            "sd = 0.5" = TeX("$\\sigma = 0.5$"))

MC_uni_ranvar_all <- cbind(MC_uni_ranvar_all, label = ranvar_labels)
MC_uni_ranmean_all <- cbind(MC_uni_ranmean_all, label = ranmean_labels)

MC_uni_ranvar_all <- mutate(MC_uni_ranvar_all,
                            difference = scales::rescale(difference, to = c(0, 1)),
                            variance = scales::rescale(variance, to = c(0, 1)),
                            CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

MC_uni_ranmean_all <- mutate(MC_uni_ranmean_all, ranmean = ranmean*2,
                             difference = scales::rescale(difference, to = c(0, 1)),
                             variance = scales::rescale(variance, to = c(0, 1)),
                             CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

#plot simulation results (Figure S2)

pdf("figureS2B.pdf", width = 9, height = 7)

ggplot(MC_uni_ranvar_all, aes(x = ranvar, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted", "dotdash")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Component Standard Deviations", y = "Estimated Level of Polarization",
       title = TeX("B: Random $\\sigma$ (intragroup homogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

pdf("figureS2A.pdf", width = 9, height = 7)

ggplot(MC_uni_ranmean_all, aes(x = ranmean, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted", "dotdash")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Distance Between Component Means", y = "Estimated Level of Polarization",
       title = TeX("A: Random $\\mu$ (intergroup heterogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

#calculate polarization ranks for random variance distributions

MC_uni_ranvar1 <- mutate(MC_uni_ranvar1, rank_true = rank(-ranvar),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         mean_label = rep("means = (-5, 5)", 1000))
MC_uni_ranvar2 <- mutate(MC_uni_ranvar2, rank_true = rank(-ranvar),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         mean_label = rep("means = (-4, 4)", 1000))
MC_uni_ranvar3 <- mutate(MC_uni_ranvar3, rank_true = rank(-ranvar),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         mean_label = rep("means = (-3, 3)", 1000))
MC_uni_ranvar4 <- mutate(MC_uni_ranvar4, rank_true = rank(-ranvar),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         mean_label = rep("means = (-2, 2)", 1000))

#calculate polarization ranks for random mean distributions

MC_uni_ranmean1 <- mutate(MC_uni_ranmean1, rank_true = rank(ranmean),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          variance_label = rep("sd = 0.5", 1000))
MC_uni_ranmean2 <- mutate(MC_uni_ranmean2, rank_true = rank(ranmean),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          variance_label = rep("sd = 1", 1000))
MC_uni_ranmean3 <- mutate(MC_uni_ranmean3, rank_true = rank(ranmean),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          variance_label = rep("sd = 1.5", 1000))
MC_uni_ranmean4 <- mutate(MC_uni_ranmean4, rank_true = rank(ranmean),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          variance_label = rep("sd = 2", 1000))

#combine polarization rank data into single data set

MC_uni_ranvar_plot <- rbind(MC_uni_ranvar1, MC_uni_ranvar2, MC_uni_ranvar3, MC_uni_ranvar4)
MC_uni_ranmean_plot <- rbind(MC_uni_ranmean1, MC_uni_ranmean2, MC_uni_ranmean3, MC_uni_ranmean4)

#tidy data and create labels for error calculation

MC_uni_ranvar_plot <- pivot_longer(MC_uni_ranvar_plot, cols = 2:4, names_to = "measure",
                                   values_to = "value") %>%
  mutate(measure = ifelse(measure == "difference", "Difference",
                          ifelse(measure == "variance", "Variance", measure)))
MC_uni_ranmean_plot <- pivot_longer(MC_uni_ranmean_plot, cols = 2:4, names_to = "measure",
                                    values_to = "value") %>%
  mutate(measure = ifelse(measure == "difference", "Difference",
                          ifelse(measure == "variance", "Variance", measure)))

MC_uni_ranvar_plot$mean_label <- factor(MC_uni_ranvar_plot$mean_label,
                                        levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                   "means = (-4, 4)", "means = (-5, 5)"))
MC_uni_ranmean_plot$variance_label <- factor(MC_uni_ranmean_plot$variance_label,
                                             levels = c("sd = 2", "sd = 1.5", "sd = 1", "sd = 0.5"))

MC_uni_ranvar_plot$measure <- factor(MC_uni_ranvar_plot$measure,
                                     levels = c("Difference", "Variance", "CPC"))
MC_uni_ranmean_plot$measure <- factor(MC_uni_ranmean_plot$measure,
                                      levels = c("Difference", "Variance", "CPC"))

###################################################################################
#Simulations for bivariate distributions (2 clusters)
###################################################################################

#generate Gaussian mixture distributions

MC_bi_ranvar <- list()
MC_bi_ranmean <- list()
MC_bi_variances <- c()
MC_bi_means <- c()

for (i in means_values) {
  for (j in 1:1000) {
    variance <- runif(1, min = 0.5, max = 2)
    data <- as.matrix(rbind(mvrnorm(500, c(i, -i),
                                    matrix(c(variance^2, 0, 0, variance^2), ncol = 2, byrow = TRUE)),
                            mvrnorm(500, c(-i, i), matrix(c(variance^2, 0, 0, variance^2),
                                                          ncol = 2, byrow = TRUE))))
    cluster <- matrix(c(rep(1, 500), rep(2, 500)), ncol = 1)
    data <- cbind(data, cluster)
    MC_bi_ranvar[[length(MC_bi_ranvar) + 1]] <- data
    MC_bi_variances[[length(MC_bi_variances) + 1]] <- variance
  }
  base::print(i)
}

for (i in variance_values) {
  for (j in 1:1000) {
    mean <- runif(1, min = 2, max = 5)
    data <- as.matrix(rbind(mvrnorm(500, c(mean, -mean),
                                    matrix(c(i^2, 0, 0, i^2), ncol = 2, byrow = TRUE)),
                            mvrnorm(500, c(-mean, mean), matrix(c(i^2, 0, 0, i^2), ncol = 2,
                                                                byrow = TRUE))))
    cluster <- matrix(c(rep(1, 500), rep(2, 500)), ncol = 1)
    data <- cbind(data, cluster)
    MC_bi_ranmean[[length(MC_bi_ranmean)+1]] <- data
    MC_bi_means[[length(MC_bi_means)+1]] <- mean
  }
  base::print(i)
}

#calculate various polarization scores for simulated distributions

MC_bi_ranvar_diff <- pblapply(MC_bi_ranvar,
                              function(x) diff_multidim(data = x, cols = 1:2, clusters = 3),
                              cl = cores)
MC_bi_ranvar_var <- pblapply(MC_bi_ranvar, function(x) tr(cov(x[,1:2], use = "complete.obs")),
                             cl = cores)
MC_bi_ranvar_CPC <- pblapply(MC_bi_ranvar,
                             function(x) CPC(x, "manual", cols = 1:2, clusters = 3, adjust = TRUE),
                             cl = cores)

MC_bi_ranmean_diff <- pblapply(MC_bi_ranmean,
                               function(x) diff_multidim(data = x, cols = 1:2, clusters = 3),
                               cl = cores)
MC_bi_ranmean_var <- pblapply(MC_bi_ranmean, function(x) tr(cov(x[,1:2], use = "complete.obs")),
                              cl = cores)
MC_bi_ranmean_CPC <- pblapply(MC_bi_ranmean,
                              function(x) CPC(x, "manual", cols = 1:2, clusters = 3, adjust = TRUE),
                              cl = cores)

#combine random variance polarization results into data frames, holding means constant within data frame for accurate rank calculations

MC_bi_ranvar1 <- as.data.frame(matrix(data = c(unlist(MC_bi_variances[1:1000]),
                                               unlist(MC_bi_ranvar_diff[1:1000]),
                                               unlist(MC_bi_ranvar_var[1:1000]),
                                               unlist(MC_bi_ranvar_CPC[1:1000])),
                                      nrow = 1000))
colnames(MC_bi_ranvar1) <- c("ranvar", "difference", "variance", "CPC")

MC_bi_ranvar2 <- as.data.frame(matrix(data = c(unlist(MC_bi_variances[1001:2000]),
                                               unlist(MC_bi_ranvar_diff[1001:2000]),
                                               unlist(MC_bi_ranvar_var[1001:2000]),
                                               unlist(MC_bi_ranvar_CPC[1001:2000])),
                                      nrow = 1000))
colnames(MC_bi_ranvar2) <- c("ranvar", "difference", "variance", "CPC")

MC_bi_ranvar3 <- as.data.frame(matrix(data = c(unlist(MC_bi_variances[2001:3000]),
                                               unlist(MC_bi_ranvar_diff[2001:3000]),
                                               unlist(MC_bi_ranvar_var[2001:3000]),
                                               unlist(MC_bi_ranvar_CPC[2001:3000])),
                                      nrow = 1000))
colnames(MC_bi_ranvar3) <- c("ranvar", "difference", "variance", "CPC")

MC_bi_ranvar4 <- as.data.frame(matrix(data = c(unlist(MC_bi_variances[3001:4000]),
                                               unlist(MC_bi_ranvar_diff[3001:4000]),
                                               unlist(MC_bi_ranvar_var[3001:4000]),
                                               unlist(MC_bi_ranvar_CPC[3001:4000])),
                                      nrow = 1000))
colnames(MC_bi_ranvar4) <- c("ranvar", "difference", "variance", "CPC")

#combine random mean polarization results into data frames, holding variance constant within data frame for accurate rank calculations

MC_bi_ranmean1 <- as.data.frame(matrix(data = c(unlist(MC_bi_means[1:1000]),
                                                unlist(MC_bi_ranmean_diff[1:1000]),
                                                unlist(MC_bi_ranmean_var[1:1000]),
                                                unlist(MC_bi_ranmean_CPC[1:1000])),
                                       nrow = 1000))
colnames(MC_bi_ranmean1) <- c("ranmean", "difference", "variance", "CPC")

MC_bi_ranmean2 <- as.data.frame(matrix(data = c(unlist(MC_bi_means[1001:2000]),
                                                unlist(MC_bi_ranmean_diff[1001:2000]),
                                                unlist(MC_bi_ranmean_var[1001:2000]),
                                                unlist(MC_bi_ranmean_CPC[1001:2000])),
                                       nrow = 1000))
colnames(MC_bi_ranmean2) <- c("ranmean", "difference", "variance", "CPC")

MC_bi_ranmean3 <- as.data.frame(matrix(data = c(unlist(MC_bi_means[2001:3000]),
                                                unlist(MC_bi_ranmean_diff[2001:3000]),
                                                unlist(MC_bi_ranmean_var[2001:3000]),
                                                unlist(MC_bi_ranmean_CPC[2001:3000])),
                                       nrow = 1000))
colnames(MC_bi_ranmean3) <- c("ranmean", "difference", "variance", "CPC")

MC_bi_ranmean4 <- as.data.frame(matrix(data = c(unlist(MC_bi_means[3001:4000]),
                                                unlist(MC_bi_ranmean_diff[3001:4000]),
                                                unlist(MC_bi_ranmean_var[3001:4000]),
                                                unlist(MC_bi_ranmean_CPC[3001:4000])),
                                       nrow = 1000))
colnames(MC_bi_ranmean4) <- c("ranmean", "difference", "variance", "CPC")

#merge data sets together for plotting

MC_bi_ranvar_all <- rbind(MC_bi_ranvar1, MC_bi_ranvar2, MC_bi_ranvar3, MC_bi_ranvar4)
MC_bi_ranmean_all <- rbind(MC_bi_ranmean1, MC_bi_ranmean2, MC_bi_ranmean3, MC_bi_ranmean4)

#add means and standard deviation labels and scale measures

ranvar_labels <- c(rep(c("means = (-5, 5)", "means = (-4, 4)", "means = (-3, 3)", "means = (-2, 2)"),
                       each = 1000))
ranmean_labels <- c(rep(c("sd = 0.5", "sd = 1", "sd = 1.5", "sd = 2"), each = 1000))

ranvar_labels <- factor(ranvar_labels, levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                  "means = (-4, 4)", "means = (-5, 5)"))
ranmean_labels <- factor(ranmean_labels, levels = c("sd = 2", "sd = 1.5",
                                                    "sd = 1", "sd = 0.5"))

levels(ranvar_labels) <- c("means = (-2, 2)" = TeX("$\\mu = \\{-2, 2\\}$"),
                           "means = (-3, 3)" = TeX("$\\mu = \\{-3, 3\\}$"),
                           "means = (-4, 4)" = TeX("$\\mu = \\{-4, 4\\}$"),
                           "means = (-5, 5)" = TeX("$\\mu = \\{-5, 5\\}$"))
levels(ranmean_labels) <- c("sd = 2" = TeX("$\\sigma = 2$"),
                            "sd = 1.5" = TeX("$\\sigma = 1.5$"),
                            "sd = 1" = TeX("$\\sigma = 1$"),
                            "sd = 0.5" = TeX("$\\sigma = 0.5$"))

MC_bi_ranvar_all <- cbind(MC_bi_ranvar_all, label = ranvar_labels)
MC_bi_ranmean_all <- cbind(MC_bi_ranmean_all, label = ranmean_labels)

MC_bi_ranvar_all <- mutate(MC_bi_ranvar_all,
                           difference = scales::rescale(difference, to = c(0, 1)),
                           variance = scales::rescale(variance, to = c(0, 1)),
                           CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

MC_bi_ranmean_all <- mutate(MC_bi_ranmean_all, ranmean = ranmean*2,
                            difference = scales::rescale(difference, to = c(0, 1)),
                            variance = scales::rescale(variance, to = c(0, 1)),
                            CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

#plot simulation results (Figure S3)

pdf("figureS3B.pdf", width = 9, height = 7)

ggplot(MC_bi_ranvar_all, aes(x = ranvar, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted", "dotdash")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Component Standard Deviations", y = "Estimated Level of Polarization",
       title = TeX("B: Random $\\sigma$ (intragroup homogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

pdf("figureS3A.pdf", width = 9, height = 7)

ggplot(MC_bi_ranmean_all, aes(x = ranmean, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted", "dotdash")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Distance Between Component Means", y = "Estimated Level of Polarization",
       title = TeX("A: Random $\\mu$ (intergroup heterogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

#calculate polarization ranks for random variance distributions

MC_bi_ranvar1 <- mutate(MC_bi_ranvar1, rank_true = rank(-ranvar),
                        difference = rank(difference),
                        variance = rank(variance),
                        CPC = rank(CPC),
                        mean_label = rep("means = (-5, 5)", 1000))
MC_bi_ranvar2 <- mutate(MC_bi_ranvar2, rank_true = rank(-ranvar),
                        difference = rank(difference),
                        variance = rank(variance),
                        CPC = rank(CPC),
                        mean_label = rep("means = (-4, 4)", 1000))
MC_bi_ranvar3 <- mutate(MC_bi_ranvar3, rank_true = rank(-ranvar),
                        difference = rank(difference),
                        variance = rank(variance),
                        CPC = rank(CPC),
                        mean_label = rep("means = (-3, 3)", 1000))
MC_bi_ranvar4 <- mutate(MC_bi_ranvar4, rank_true = rank(-ranvar),
                        difference = rank(difference),
                        variance = rank(variance),
                        CPC = rank(CPC),
                        mean_label = rep("means = (-2, 2)", 1000))

#calculate polarization ranks for random mean distributions

MC_bi_ranmean1 <- mutate(MC_bi_ranmean1, rank_true = rank(ranmean),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         variance_label = rep("sd = 0.5", 1000))
MC_bi_ranmean2 <- mutate(MC_bi_ranmean2, rank_true = rank(ranmean),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         variance_label = rep("sd = 1", 1000))
MC_bi_ranmean3 <- mutate(MC_bi_ranmean3, rank_true = rank(ranmean),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         variance_label = rep("sd = 1.5", 1000))
MC_bi_ranmean4 <- mutate(MC_bi_ranmean4, rank_true = rank(ranmean),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         variance_label = rep("sd = 2", 1000))

#combine polarization rank data into single data set

MC_bi_ranvar_plot <- rbind(MC_bi_ranvar1, MC_bi_ranvar2, MC_bi_ranvar3, MC_bi_ranvar4)
MC_bi_ranmean_plot <- rbind(MC_bi_ranmean1, MC_bi_ranmean2, MC_bi_ranmean3, MC_bi_ranmean4)

#tidy data and create labels for error calculation

MC_bi_ranvar_plot <- pivot_longer(MC_bi_ranvar_plot, cols = 2:4,
                                  names_to = "measure", values_to = "value") %>%
  mutate(measure = ifelse(measure == "difference", "Difference",
                          ifelse(measure == "variance", "Variance", measure)))
MC_bi_ranmean_plot <- pivot_longer(MC_bi_ranmean_plot, cols = 2:4,
                                   names_to = "measure", values_to = "value") %>%
  mutate(measure = ifelse(measure == "difference", "Difference",
                          ifelse(measure == "variance", "Variance", measure)))

MC_bi_ranvar_plot$mean_label <- factor(MC_bi_ranvar_plot$mean_label,
                                       levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                  "means = (-4, 4)", "means = (-5, 5)"))
MC_bi_ranmean_plot$variance_label <- factor(MC_bi_ranmean_plot$variance_label,
                                            levels = c("sd = 2", "sd = 1.5", "sd = 1", "sd = 0.5"))
MC_bi_ranvar_plot$measure <- factor(MC_bi_ranvar_plot$measure,
                                    levels = c("Difference", "Variance", "CPC"))
MC_bi_ranmean_plot$measure <- factor(MC_bi_ranmean_plot$measure,
                                     levels = c("Difference", "Variance", "CPC"))

###################################################################################
#Simulations for univariate distributions (3 clusters)
###################################################################################

#set seed

set.seed(123)

#generate Gaussian mixture distributions

MC3_uni_ranvar <- list()
MC3_uni_ranmean <- list()
MC3_uni_variances <- c()
MC3_uni_means <- c()

for (i in means_values) {
  for (j in 1:1000) {
    variance <- runif(1, min = 0.5, max = 2)
    data <- matrix(c(rnorm(333, -i, variance), rnorm(334, 0, variance), rnorm(333, i, variance)),
                   ncol = 1)
    cluster <- matrix(c(rep(1, 333), rep(2, 334), rep(3, 333)), ncol = 1)
    data <- cbind(data, cluster)
    MC3_uni_ranvar[[length(MC3_uni_ranvar) + 1]] <- data
    MC3_uni_variances[[length(MC3_uni_variances) + 1]] <- variance
  }
  base::print(i)
}

for (i in variance_values) {
  for (j in 1:1000) {
    mean <- runif(1, min = 2, max = 5)
    data <- matrix(c(rnorm(333, -mean, i), rnorm(334, 0, i), rnorm(333, mean, i)),
                   ncol = 1)
    cluster <- matrix(c(rep(1, 333), rep(2, 334), rep(3, 333)), ncol = 1)
    data <- cbind(data, cluster)
    MC3_uni_ranmean[[length(MC3_uni_ranmean) + 1]] <- data
    MC3_uni_means[[length(MC3_uni_means) + 1]] <- mean
  }
  base::print(i)
}

#calculate various polarization scores for simulated distributions

MC3_uni_ranvar_diff <- pblapply(MC3_uni_ranvar,
                                function(x) diff_multidim(x, cols = 1, clusters = 2), cl = cores)
MC3_uni_ranvar_var <- pblapply(MC3_uni_ranvar, function(x) var(x[,1]), cl = cores)
MC3_uni_ranvar_CPC <- pblapply(MC3_uni_ranvar,
                               function(x) CPC(x, "manual", cols = 1, clusters = 2, adjust = TRUE),
                               cl = cores)

MC3_uni_ranmean_diff <- pblapply(MC3_uni_ranmean,
                                 function(x) diff_multidim(x, cols = 1, clusters = 2),
                                 cl = cores)
MC3_uni_ranmean_var <- pblapply(MC3_uni_ranmean, function(x) var(x[,1]), cl = cores)
MC3_uni_ranmean_CPC <- pblapply(MC3_uni_ranmean,
                                function(x) CPC(x, "manual", cols = 1, clusters = 2, adjust = TRUE),
                                cl = cores)

#combine random variance polarization results into data frames, holding means constant within data frame for accurate rank calculations

MC3_uni_ranvar1 <- as.data.frame(matrix(data = c(unlist(MC3_uni_variances[1:1000]),
                                                 unlist(MC3_uni_ranvar_diff[1:1000]),
                                                 unlist(MC3_uni_ranvar_var[1:1000]),
                                                 unlist(MC3_uni_ranvar_CPC[1:1000])),
                                        nrow = 1000))
colnames(MC3_uni_ranvar1) <- c("ranvar", "difference", "variance", "CPC")

MC3_uni_ranvar2 <- as.data.frame(matrix(data = c(unlist(MC3_uni_variances[1001:2000]),
                                                 unlist(MC3_uni_ranvar_diff[1001:2000]),
                                                 unlist(MC3_uni_ranvar_var[1001:2000]),
                                                 unlist(MC3_uni_ranvar_CPC[1001:2000])),
                                        nrow = 1000))
colnames(MC3_uni_ranvar2) <- c("ranvar", "difference", "variance", "CPC")

MC3_uni_ranvar3 <- as.data.frame(matrix(data = c(unlist(MC3_uni_variances[2001:3000]),
                                                 unlist(MC3_uni_ranvar_diff[2001:3000]),
                                                 unlist(MC3_uni_ranvar_var[2001:3000]),
                                                 unlist(MC3_uni_ranvar_CPC[2001:3000])),
                                        nrow = 1000))
colnames(MC3_uni_ranvar3) <- c("ranvar", "difference", "variance", "CPC")

MC3_uni_ranvar4 <- as.data.frame(matrix(data = c(unlist(MC3_uni_variances[3001:4000]),
                                                 unlist(MC3_uni_ranvar_diff[3001:4000]),
                                                 unlist(MC3_uni_ranvar_var[3001:4000]),
                                                 unlist(MC3_uni_ranvar_CPC[3001:4000])),
                                        nrow = 1000))
colnames(MC3_uni_ranvar4) <- c("ranvar", "difference", "variance", "CPC")

#combine random mean polarization results into data frames, holding variance constant within data frame for accurate rank calculations

MC3_uni_ranmean1 <- as.data.frame(matrix(data = c(unlist(MC3_uni_means[1:1000]),
                                                  unlist(MC3_uni_ranmean_diff[1:1000]),
                                                  unlist(MC3_uni_ranmean_var[1:1000]),
                                                  unlist(MC3_uni_ranmean_CPC[1:1000])),
                                         nrow = 1000))
colnames(MC3_uni_ranmean1) <- c("ranmean", "difference", "variance", "CPC")

MC3_uni_ranmean2 <- as.data.frame(matrix(data = c(unlist(MC3_uni_means[1001:2000]),
                                                  unlist(MC3_uni_ranmean_diff[1001:2000]),
                                                  unlist(MC3_uni_ranmean_var[1001:2000]),
                                                  unlist(MC3_uni_ranmean_CPC[1001:2000])),
                                         nrow = 1000))
colnames(MC3_uni_ranmean2) <- c("ranmean", "difference", "variance", "CPC")

MC3_uni_ranmean3 <- as.data.frame(matrix(data = c(unlist(MC3_uni_means[2001:3000]),
                                                  unlist(MC3_uni_ranmean_diff[2001:3000]),
                                                  unlist(MC3_uni_ranmean_var[2001:3000]),
                                                  unlist(MC3_uni_ranmean_CPC[2001:3000])),
                                         nrow = 1000))
colnames(MC3_uni_ranmean3) <- c("ranmean", "difference", "variance", "CPC")

MC3_uni_ranmean4 <- as.data.frame(matrix(data = c(unlist(MC3_uni_means[3001:4000]),
                                                  unlist(MC3_uni_ranmean_diff[3001:4000]),
                                                  unlist(MC3_uni_ranmean_var[3001:4000]),
                                                  unlist(MC3_uni_ranmean_CPC[3001:4000])),
                                         nrow = 1000))
colnames(MC3_uni_ranmean4) <- c("ranmean", "difference", "variance", "CPC")

#merge data sets together for plotting

MC3_uni_ranvar_all <- rbind(MC3_uni_ranvar1, MC3_uni_ranvar2, MC3_uni_ranvar3, MC3_uni_ranvar4)
MC3_uni_ranmean_all <- rbind(MC3_uni_ranmean1, MC3_uni_ranmean2, MC3_uni_ranmean3, MC3_uni_ranmean4)

#add means and standard deviation labels and scale measures

ranvar_labels <- c(rep(c("means = (-5, 5)", "means = (-4, 4)", "means = (-3, 3)", "means = (-2, 2)"),
                       each = 1000))
ranmean_labels <- c(rep(c("sd = 0.5", "sd = 1", "sd = 1.5", "sd = 2"), each = 1000))

ranvar_labels <- factor(ranvar_labels, levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                  "means = (-4, 4)", "means = (-5, 5)"))
ranmean_labels <- factor(ranmean_labels, levels = c("sd = 2", "sd = 1.5",
                                                    "sd = 1", "sd = 0.5"))

levels(ranvar_labels) <- c("means = (-2, 2)" = TeX("$\\mu = \\{-2, 2\\}$"),
                           "means = (-3, 3)" = TeX("$\\mu = \\{-3, 3\\}$"),
                           "means = (-4, 4)" = TeX("$\\mu = \\{-4, 4\\}$"),
                           "means = (-5, 5)" = TeX("$\\mu = \\{-5, 5\\}$"))
levels(ranmean_labels) <- c("sd = 2" = TeX("$\\sigma = 2$"),
                            "sd = 1.5" = TeX("$\\sigma = 1.5$"),
                            "sd = 1" = TeX("$\\sigma = 1$"),
                            "sd = 0.5" = TeX("$\\sigma = 0.5$"))

MC3_uni_ranvar_all <- cbind(MC3_uni_ranvar_all, label = ranvar_labels)
MC3_uni_ranmean_all <- cbind(MC3_uni_ranmean_all, label = ranmean_labels)

MC3_uni_ranvar_all <- mutate(MC3_uni_ranvar_all,
                             difference = scales::rescale(difference, to = c(0, 1)),
                             variance = scales::rescale(variance, to = c(0, 1)),
                             CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

MC3_uni_ranmean_all <- mutate(MC3_uni_ranmean_all, ranmean = ranmean*2,
                              difference = scales::rescale(difference, to = c(0, 1)),
                              variance = scales::rescale(variance, to = c(0, 1)),
                              CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

#plot simulation results (Figure R1)

pdf("figureR1B.pdf", width = 9, height = 7)

ggplot(MC3_uni_ranvar_all, aes(x = ranvar, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted", "dotdash")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Component Standard Deviations", y = "Estimated Level of Polarization",
       title = TeX("B: Random $\\sigma$ (intragroup homogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

pdf("figureR1A.pdf", width = 9, height = 7)

ggplot(MC3_uni_ranmean_all, aes(x = ranmean, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted", "dotdash")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Distance Between Component Means", y = "Estimated Level of Polarization",
       title = TeX("A: Random $\\mu$ (intergroup heterogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

#calculate polarization ranks for random variance distributions

MC3_uni_ranvar1 <- mutate(MC3_uni_ranvar1, rank_true = rank(-ranvar),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          mean_label = rep("means = (-5, 5)", 1000))
MC3_uni_ranvar2 <- mutate(MC3_uni_ranvar2, rank_true = rank(-ranvar),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          mean_label = rep("means = (-4, 4)", 1000))
MC3_uni_ranvar3 <- mutate(MC3_uni_ranvar3, rank_true = rank(-ranvar),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          mean_label = rep("means = (-3, 3)", 1000))
MC3_uni_ranvar4 <- mutate(MC3_uni_ranvar4, rank_true = rank(-ranvar),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          mean_label = rep("means = (-2, 2)", 1000))

#calculate polarization ranks for random mean distributions

MC3_uni_ranmean1 <- mutate(MC3_uni_ranmean1, rank_true = rank(ranmean),
                           difference = rank(difference),
                           variance = rank(variance),
                           CPC = rank(CPC),
                           variance_label = rep("sd = 0.5", 1000))
MC3_uni_ranmean2 <- mutate(MC3_uni_ranmean2, rank_true = rank(ranmean),
                           difference = rank(difference),
                           variance = rank(variance),
                           CPC = rank(CPC),
                           variance_label = rep("sd = 1", 1000))
MC3_uni_ranmean3 <- mutate(MC3_uni_ranmean3, rank_true = rank(ranmean),
                           difference = rank(difference),
                           variance = rank(variance),
                           CPC = rank(CPC),
                           variance_label = rep("sd = 1.5", 1000))
MC3_uni_ranmean4 <- mutate(MC3_uni_ranmean4, rank_true = rank(ranmean),
                           difference = rank(difference),
                           variance = rank(variance),
                           CPC = rank(CPC),
                           variance_label = rep("sd = 2", 1000))

#combine polarization rank data into single data set

MC3_uni_ranvar_plot <- rbind(MC3_uni_ranvar1, MC3_uni_ranvar2, MC3_uni_ranvar3, MC3_uni_ranvar4)
MC3_uni_ranmean_plot <- rbind(MC3_uni_ranmean1, MC3_uni_ranmean2, MC3_uni_ranmean3, MC3_uni_ranmean4)

#tidy data and create labels for error calculation

MC3_uni_ranvar_plot <- pivot_longer(MC3_uni_ranvar_plot, cols = 2:4,
                                    names_to = "measure", values_to = "value") %>%
  mutate(measure = ifelse(measure == "difference", "Difference",
                          ifelse(measure == "variance", "Variance", measure)))
MC3_uni_ranmean_plot <- pivot_longer(MC3_uni_ranmean_plot, cols = 2:4,
                                     names_to = "measure", values_to = "value") %>%
  mutate(measure = ifelse(measure == "difference", "Difference",
                          ifelse(measure == "variance", "Variance", measure)))

MC3_uni_ranvar_plot$mean_label <- factor(MC3_uni_ranvar_plot$mean_label,
                                         levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                    "means = (-4, 4)", "means = (-5, 5)"))
MC3_uni_ranmean_plot$variance_label <- factor(MC3_uni_ranmean_plot$variance_label,
                                              levels = c("sd = 2", "sd = 1.5", "sd = 1", "sd = 0.5"))

MC3_uni_ranvar_plot$measure <- factor(MC3_uni_ranvar_plot$measure,
                                      levels = c("Difference", "Variance", "CPC"))
MC3_uni_ranmean_plot$measure <- factor(MC3_uni_ranmean_plot$measure,
                                       levels = c("Difference", "Variance", "CPC"))

###################################################################################
#Simulations for bivariate distributions (3 clusters)
###################################################################################

#generate Gaussian mixture distributions

MC3_bi_ranvar <- list()
MC3_bi_ranmean <- list()
MC3_bi_variances <- c()
MC3_bi_means <- c()

for (i in means_values) {
  for (j in 1:1000) {
    variance <- runif(1, min = 0.5, max = 2)
    data <- as.matrix(rbind(mvrnorm(333, c(i, -i),
                                    matrix(c(variance^2, 0, 0, variance^2), ncol = 2, byrow = TRUE)),
                            mvrnorm(334, c(0, i),
                                    matrix(c(variance^2, 0, 0, variance^2), ncol = 2, byrow = TRUE)),
                            mvrnorm(333, c(-i, -i),
                                    matrix(c(variance^2, 0, 0, variance^2), ncol = 2, byrow = TRUE))))
    cluster <- matrix(c(rep(1, 333), rep(2, 334), rep(3, 333)), ncol = 1)
    data <- cbind(data, cluster)
    MC3_bi_ranvar[[length(MC3_bi_ranvar) + 1]] <- data
    MC3_bi_variances[[length(MC3_bi_variances) + 1]] <- variance
  }
  base::print(i)
}

for (i in variance_values) {
  for (j in 1:1000) {
    mean <- runif(1, min = 2, max = 5)
    data <- as.matrix(rbind(mvrnorm(333, c(mean, -mean),
                                    matrix(c(i^2, 0, 0, i^2), ncol = 2, byrow = TRUE)),
                            mvrnorm(334, c(0, mean),
                                    matrix(c(i^2, 0, 0, i^2), ncol = 2, byrow = TRUE)),
                            mvrnorm(333, c(-mean, -mean),
                                    matrix(c(i^2, 0, 0, i^2), ncol = 2, byrow = TRUE))))
    cluster <- matrix(c(rep(1, 333), rep(2, 334), rep(3, 333)), ncol = 1)
    data <- cbind(data, cluster)
    MC3_bi_ranmean[[length(MC3_bi_ranmean)+1]] <- data
    MC3_bi_means[[length(MC3_bi_means)+1]] <- mean
  }
  base::print(i)
}

#calculate various polarization scores for simulated distributions

MC3_bi_ranvar_diff <- pblapply(MC3_bi_ranvar,
                               function(x) diff_multidim(data = x, cols = 1:2, clusters = 3),
                               cl = cores)
MC3_bi_ranvar_var <- pblapply(MC3_bi_ranvar, function(x) tr(cov(x[,1:2], use = "complete.obs")),
                              cl = cores)
MC3_bi_ranvar_CPC <- pblapply(MC3_bi_ranvar,
                              function(x) CPC(x, "manual", cols = 1:2, clusters = 3, adjust = TRUE),
                              cl = cores)

MC3_bi_ranmean_diff <- pblapply(MC3_bi_ranmean,
                                function(x) diff_multidim(data = x, cols = 1:2, clusters = 3),
                                cl = cores)
MC3_bi_ranmean_var <- pblapply(MC3_bi_ranmean, function(x) tr(cov(x[,1:2], use = "complete.obs")),
                               cl = cores)
MC3_bi_ranmean_CPC <- pblapply(MC3_bi_ranmean,
                               function(x) CPC(x, "manual", cols = 1:2, clusters = 3, adjust = TRUE),
                               cl = cores)

#combine random variance polarization results into data frames, holding means constant within data frame for accurate rank calculations

MC3_bi_ranvar1 <- as.data.frame(matrix(data = c(unlist(MC3_bi_variances[1:1000]),
                                                unlist(MC3_bi_ranvar_diff[1:1000]),
                                                unlist(MC3_bi_ranvar_var[1:1000]),
                                                unlist(MC3_bi_ranvar_CPC[1:1000])),
                                       nrow = 1000))
colnames(MC3_bi_ranvar1) <- c("ranvar", "difference", "variance", "CPC")

MC3_bi_ranvar2 <- as.data.frame(matrix(data = c(unlist(MC3_bi_variances[1001:2000]),
                                                unlist(MC3_bi_ranvar_diff[1001:2000]),
                                                unlist(MC3_bi_ranvar_var[1001:2000]),
                                                unlist(MC3_bi_ranvar_CPC[1001:2000])),
                                       nrow = 1000))
colnames(MC3_bi_ranvar2) <- c("ranvar", "difference", "variance", "CPC")

MC3_bi_ranvar3 <- as.data.frame(matrix(data = c(unlist(MC3_bi_variances[2001:3000]),
                                                unlist(MC3_bi_ranvar_diff[2001:3000]),
                                                unlist(MC3_bi_ranvar_var[2001:3000]),
                                                unlist(MC3_bi_ranvar_CPC[2001:3000])),
                                       nrow = 1000))
colnames(MC3_bi_ranvar3) <- c("ranvar", "difference", "variance", "CPC")

MC3_bi_ranvar4 <- as.data.frame(matrix(data = c(unlist(MC3_bi_variances[3001:4000]),
                                                unlist(MC3_bi_ranvar_diff[3001:4000]),
                                                unlist(MC3_bi_ranvar_var[3001:4000]),
                                                unlist(MC3_bi_ranvar_CPC[3001:4000])),
                                       nrow = 1000))
colnames(MC3_bi_ranvar4) <- c("ranvar", "difference", "variance", "CPC")

#combine random mean polarization results into data frames, holding variance constant within data frame for accurate rank calculations

MC3_bi_ranmean1 <- as.data.frame(matrix(data = c(unlist(MC3_bi_means[1:1000]),
                                                 unlist(MC3_bi_ranmean_diff[1:1000]),
                                                 unlist(MC3_bi_ranmean_var[1:1000]),
                                                 unlist(MC3_bi_ranmean_CPC[1:1000])),
                                        nrow = 1000))
colnames(MC3_bi_ranmean1) <- c("ranmean", "difference", "variance", "CPC")

MC3_bi_ranmean2 <- as.data.frame(matrix(data = c(unlist(MC3_bi_means[1001:2000]),
                                                 unlist(MC3_bi_ranmean_diff[1001:2000]),
                                                 unlist(MC3_bi_ranmean_var[1001:2000]),
                                                 unlist(MC3_bi_ranmean_CPC[1001:2000])),
                                        nrow = 1000))
colnames(MC3_bi_ranmean2) <- c("ranmean", "difference", "variance", "CPC")

MC3_bi_ranmean3 <- as.data.frame(matrix(data = c(unlist(MC3_bi_means[2001:3000]),
                                                 unlist(MC3_bi_ranmean_diff[2001:3000]),
                                                 unlist(MC3_bi_ranmean_var[2001:3000]),
                                                 unlist(MC3_bi_ranmean_CPC[2001:3000])),
                                        nrow = 1000))
colnames(MC3_bi_ranmean3) <- c("ranmean", "difference", "variance", "CPC")

MC3_bi_ranmean4 <- as.data.frame(matrix(data = c(unlist(MC3_bi_means[3001:4000]),
                                                 unlist(MC3_bi_ranmean_diff[3001:4000]),
                                                 unlist(MC3_bi_ranmean_var[3001:4000]),
                                                 unlist(MC3_bi_ranmean_CPC[3001:4000])),
                                        nrow = 1000))
colnames(MC3_bi_ranmean4) <- c("ranmean", "difference", "variance", "CPC")

#merge data sets together for plotting

MC3_bi_ranvar_all <- rbind(MC3_bi_ranvar1, MC3_bi_ranvar2, MC3_bi_ranvar3, MC3_bi_ranvar4)
MC3_bi_ranmean_all <- rbind(MC3_bi_ranmean1, MC3_bi_ranmean2, MC3_bi_ranmean3, MC3_bi_ranmean4)

#add means and standard deviation labels and scale measures

ranvar_labels <- c(rep(c("means = (-5, 5)", "means = (-4, 4)", "means = (-3, 3)", "means = (-2, 2)"),
                       each = 1000))
ranmean_labels <- c(rep(c("sd = 0.5", "sd = 1", "sd = 1.5", "sd = 2"), each = 1000))

ranvar_labels <- factor(ranvar_labels, levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                  "means = (-4, 4)", "means = (-5, 5)"))
ranmean_labels <- factor(ranmean_labels, levels = c("sd = 2", "sd = 1.5",
                                                    "sd = 1", "sd = 0.5"))

levels(ranvar_labels) <- c("means = (-2, 2)" = TeX("$\\mu = \\{-2, 2\\}$"),
                           "means = (-3, 3)" = TeX("$\\mu = \\{-3, 3\\}$"),
                           "means = (-4, 4)" = TeX("$\\mu = \\{-4, 4\\}$"),
                           "means = (-5, 5)" = TeX("$\\mu = \\{-5, 5\\}$"))
levels(ranmean_labels) <- c("sd = 2" = TeX("$\\sigma = 2$"),
                            "sd = 1.5" = TeX("$\\sigma = 1.5$"),
                            "sd = 1" = TeX("$\\sigma = 1$"),
                            "sd = 0.5" = TeX("$\\sigma = 0.5$"))

MC3_bi_ranvar_all <- cbind(MC3_bi_ranvar_all, label = ranvar_labels)
MC3_bi_ranmean_all <- cbind(MC3_bi_ranmean_all, label = ranmean_labels)

MC3_bi_ranvar_all <- mutate(MC3_bi_ranvar_all,
                            difference = scales::rescale(difference, to = c(0, 1)),
                            variance = scales::rescale(variance, to = c(0, 1)),
                            CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

MC3_bi_ranmean_all <- mutate(MC3_bi_ranmean_all, ranmean = ranmean*2,
                             difference = scales::rescale(difference, to = c(0, 1)),
                             variance = scales::rescale(variance, to = c(0, 1)),
                             CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

#plot simulation results (Figure R2)

pdf("figureR2B.pdf", width = 9, height = 7)

ggplot(MC3_bi_ranvar_all, aes(x = ranvar, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted", "dotdash")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Component Standard Deviations", y = "Estimated Level of Polarization",
       title = TeX("B: Random $\\sigma$ (intragroup homogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

pdf("figureR2A.pdf", width = 9, height = 7)

ggplot(MC3_bi_ranmean_all, aes(x = ranmean, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted", "dotdash")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Distance Between Component Means", y = "Estimated Level of Polarization",
       title = TeX("A: Random $\\mu$ (intergroup heterogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

#calculate polarization ranks for random variance distributions

MC3_bi_ranvar1 <- mutate(MC3_bi_ranvar1, rank_true = rank(-ranvar),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         mean_label = rep("means = (-5, 5)", 1000))
MC3_bi_ranvar2 <- mutate(MC3_bi_ranvar2, rank_true = rank(-ranvar),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         mean_label = rep("means = (-4, 4)", 1000))
MC3_bi_ranvar3 <- mutate(MC3_bi_ranvar3, rank_true = rank(-ranvar),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         mean_label = rep("means = (-3, 3)", 1000))
MC3_bi_ranvar4 <- mutate(MC3_bi_ranvar4, rank_true = rank(-ranvar),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         mean_label = rep("means = (-2, 2)", 1000))

#calculate polarization ranks for random mean distributions

MC3_bi_ranmean1 <- mutate(MC3_bi_ranmean1, rank_true = rank(ranmean),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          variance_label = rep("sd = 0.5", 1000))
MC3_bi_ranmean2 <- mutate(MC3_bi_ranmean2, rank_true = rank(ranmean),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          variance_label = rep("sd = 1", 1000))
MC3_bi_ranmean3 <- mutate(MC3_bi_ranmean3, rank_true = rank(ranmean),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          variance_label = rep("sd = 1.5", 1000))
MC3_bi_ranmean4 <- mutate(MC3_bi_ranmean4, rank_true = rank(ranmean),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          variance_label = rep("sd = 2", 1000))

#combine polarization rank data into single data set

MC3_bi_ranvar_plot <- rbind(MC3_bi_ranvar1, MC3_bi_ranvar2, MC3_bi_ranvar3, MC3_bi_ranvar4)
MC3_bi_ranmean_plot <- rbind(MC3_bi_ranmean1, MC3_bi_ranmean2, MC3_bi_ranmean3, MC3_bi_ranmean4)

#tidy data and create labels for error calculation

MC3_bi_ranvar_plot <- pivot_longer(MC3_bi_ranvar_plot, cols = 2:4,
                                   names_to = "measure", values_to = "value") %>%
  mutate(measure = ifelse(measure == "difference", "Difference",
                          ifelse(measure == "variance", "Variance", measure)))
MC3_bi_ranmean_plot <- pivot_longer(MC3_bi_ranmean_plot, cols = 2:4,
                                    names_to = "measure", values_to = "value") %>%
  mutate(measure = ifelse(measure == "difference", "Difference",
                          ifelse(measure == "variance", "Variance", measure)))

MC3_bi_ranvar_plot$mean_label <- factor(MC3_bi_ranvar_plot$mean_label,
                                        levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                   "means = (-4, 4)", "means = (-5, 5)"))
MC3_bi_ranmean_plot$variance_label <- factor(MC3_bi_ranmean_plot$variance_label,
                                             levels = c("sd = 2", "sd = 1.5", "sd = 1", "sd = 0.5"))
MC3_bi_ranvar_plot$measure <- factor(MC3_bi_ranvar_plot$measure,
                                     levels = c("Difference", "Variance", "CPC"))
MC3_bi_ranmean_plot$measure <- factor(MC3_bi_ranmean_plot$measure,
                                      levels = c("Difference", "Variance", "CPC"))

###################################################################################
#Simulations for univariate distributions (4 clusters)
###################################################################################

#set seed

set.seed(123)

#generate Gaussian mixture distributions

MC4_uni_ranvar <- list()
MC4_uni_ranmean <- list()
MC4_uni_variances <- c()
MC4_uni_means <- c()

for (i in means_values) {
  for (j in 1:1000) {
    variance <- runif(1, min = 0.5, max = 2)
    data <- matrix(c(rnorm(250, -i*2, variance), rnorm(250, -i/2, variance),
                     rnorm(250, i/2, variance), rnorm(250, i*2, variance)), ncol = 1)
    cluster <- matrix(c(rep(1, 250), rep(2, 250), rep(3, 250), rep(4, 250)), ncol = 1)
    data <- cbind(data, cluster)
    MC4_uni_ranvar[[length(MC4_uni_ranvar) + 1]] <- data
    MC4_uni_variances[[length(MC4_uni_variances) + 1]] <- variance
  }
  base::print(i)
}

for (i in variance_values) {
  for (j in 1:1000) {
    mean <- runif(1, min = 2, max = 5)
    data <- matrix(c(rnorm(250, -mean*2, i), rnorm(250, -mean/2, i),
                     rnorm(250, mean/2, i), rnorm(250, mean*2, i)), ncol = 1)
    cluster <- matrix(c(rep(1, 250), rep(2, 250), rep(3, 250), rep(4, 250)), ncol = 1)
    data <- cbind(data, cluster)
    MC4_uni_ranmean[[length(MC4_uni_ranmean) + 1]] <- data
    MC4_uni_means[[length(MC4_uni_means) + 1]] <- mean
  }
  base::print(i)
}

#calculate various polarization scores for simulated distributions

MC4_uni_ranvar_diff <- pblapply(MC4_uni_ranvar,
                                function(x) diff_multidim(x, cols = 1, clusters = 2), cl = cores)
MC4_uni_ranvar_var <- pblapply(MC4_uni_ranvar, function(x) var(x[,1]), cl = cores)
MC4_uni_ranvar_CPC <- pblapply(MC4_uni_ranvar,
                               function(x) CPC(x, "manual", cols = 1, clusters = 2, adjust = TRUE),
                               cl = cores)

MC4_uni_ranmean_diff <- pblapply(MC4_uni_ranmean,
                                 function(x) diff_multidim(x, cols = 1, clusters = 2), cl = cores)
MC4_uni_ranmean_var <- pblapply(MC4_uni_ranmean, function(x) var(x[,1]), cl = cores)
MC4_uni_ranmean_CPC <- pblapply(MC4_uni_ranmean,
                                function(x) CPC(x, "manual", cols = 1, clusters = 2, adjust = TRUE),
                                cl = cores)

#combine random variance polarization results into data frames, holding means constant within data frame for accurate rank calculations

MC4_uni_ranvar1 <- as.data.frame(matrix(data = c(unlist(MC4_uni_variances[1:1000]),
                                                 unlist(MC4_uni_ranvar_diff[1:1000]),
                                                 unlist(MC4_uni_ranvar_var[1:1000]),
                                                 unlist(MC4_uni_ranvar_CPC[1:1000])),
                                        nrow = 1000))
colnames(MC4_uni_ranvar1) <- c("ranvar", "difference", "variance", "CPC")

MC4_uni_ranvar2 <- as.data.frame(matrix(data = c(unlist(MC4_uni_variances[1001:2000]),
                                                 unlist(MC4_uni_ranvar_diff[1001:2000]),
                                                 unlist(MC4_uni_ranvar_var[1001:2000]),
                                                 unlist(MC4_uni_ranvar_CPC[1001:2000])),
                                        nrow = 1000))
colnames(MC4_uni_ranvar2) <- c("ranvar", "difference", "variance", "CPC")

MC4_uni_ranvar3 <- as.data.frame(matrix(data = c(unlist(MC4_uni_variances[2001:3000]),
                                                 unlist(MC4_uni_ranvar_diff[2001:3000]),
                                                 unlist(MC4_uni_ranvar_var[2001:3000]),
                                                 unlist(MC4_uni_ranvar_CPC[2001:3000])),
                                        nrow = 1000))
colnames(MC4_uni_ranvar3) <- c("ranvar", "difference", "variance", "CPC")

MC4_uni_ranvar4 <- as.data.frame(matrix(data = c(unlist(MC4_uni_variances[3001:4000]),
                                                 unlist(MC4_uni_ranvar_diff[3001:4000]),
                                                 unlist(MC4_uni_ranvar_var[3001:4000]),
                                                 unlist(MC4_uni_ranvar_CPC[3001:4000])),
                                        nrow = 1000))
colnames(MC4_uni_ranvar4) <- c("ranvar", "difference", "variance", "CPC")

#combine random mean polarization results into data frames, holding variance constant within data frame for accurate rank calculations

MC4_uni_ranmean1 <- as.data.frame(matrix(data = c(unlist(MC4_uni_means[1:1000]),
                                                  unlist(MC4_uni_ranmean_diff[1:1000]),
                                                  unlist(MC4_uni_ranmean_var[1:1000]),
                                                  unlist(MC4_uni_ranmean_CPC[1:1000])),
                                         nrow = 1000))
colnames(MC4_uni_ranmean1) <- c("ranmean", "difference", "variance", "CPC")

MC4_uni_ranmean2 <- as.data.frame(matrix(data = c(unlist(MC4_uni_means[1001:2000]),
                                                  unlist(MC4_uni_ranmean_diff[1001:2000]),
                                                  unlist(MC4_uni_ranmean_var[1001:2000]),
                                                  unlist(MC4_uni_ranmean_CPC[1001:2000])),
                                         nrow = 1000))
colnames(MC4_uni_ranmean2) <- c("ranmean", "difference", "variance", "CPC")

MC4_uni_ranmean3 <- as.data.frame(matrix(data = c(unlist(MC4_uni_means[2001:3000]),
                                                  unlist(MC4_uni_ranmean_diff[2001:3000]),
                                                  unlist(MC4_uni_ranmean_var[2001:3000]),
                                                  unlist(MC4_uni_ranmean_CPC[2001:3000])),
                                         nrow = 1000))
colnames(MC4_uni_ranmean3) <- c("ranmean", "difference", "variance", "CPC")

MC4_uni_ranmean4 <- as.data.frame(matrix(data = c(unlist(MC4_uni_means[3001:4000]),
                                                  unlist(MC4_uni_ranmean_diff[3001:4000]),
                                                  unlist(MC4_uni_ranmean_var[3001:4000]),
                                                  unlist(MC4_uni_ranmean_CPC[3001:4000])),
                                         nrow = 1000))
colnames(MC4_uni_ranmean4) <- c("ranmean", "difference", "variance", "CPC")

#merge data sets together for plotting

MC4_uni_ranvar_all <- rbind(MC4_uni_ranvar1, MC4_uni_ranvar2, MC4_uni_ranvar3, MC4_uni_ranvar4)
MC4_uni_ranmean_all <- rbind(MC4_uni_ranmean1, MC4_uni_ranmean2, MC4_uni_ranmean3, MC4_uni_ranmean4)

#add means and standard deviation labels and scale measures

ranvar_labels <- c(rep(c("means = (-5, 5)", "means = (-4, 4)", "means = (-3, 3)", "means = (-2, 2)"),
                       each = 1000))
ranmean_labels <- c(rep(c("sd = 0.5", "sd = 1", "sd = 1.5", "sd = 2"), each = 1000))

ranvar_labels <- factor(ranvar_labels, levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                  "means = (-4, 4)", "means = (-5, 5)"))
ranmean_labels <- factor(ranmean_labels, levels = c("sd = 2", "sd = 1.5",
                                                    "sd = 1", "sd = 0.5"))

levels(ranvar_labels) <- c("means = (-2, 2)" = TeX("$\\mu = \\{-2, 2\\}$"),
                           "means = (-3, 3)" = TeX("$\\mu = \\{-3, 3\\}$"),
                           "means = (-4, 4)" = TeX("$\\mu = \\{-4, 4\\}$"),
                           "means = (-5, 5)" = TeX("$\\mu = \\{-5, 5\\}$"))
levels(ranmean_labels) <- c("sd = 2" = TeX("$\\sigma = 2$"),
                            "sd = 1.5" = TeX("$\\sigma = 1.5$"),
                            "sd = 1" = TeX("$\\sigma = 1$"),
                            "sd = 0.5" = TeX("$\\sigma = 0.5$"))

MC4_uni_ranvar_all <- cbind(MC4_uni_ranvar_all, label = ranvar_labels)
MC4_uni_ranmean_all <- cbind(MC4_uni_ranmean_all, label = ranmean_labels)

MC4_uni_ranvar_all <- mutate(MC4_uni_ranvar_all,
                             difference = scales::rescale(difference, to = c(0, 1)),
                             variance = scales::rescale(variance, to = c(0, 1)),
                             CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

MC4_uni_ranmean_all <- mutate(MC4_uni_ranmean_all, ranmean = ranmean*2,
                              difference = scales::rescale(difference, to = c(0, 1)),
                              variance = scales::rescale(variance, to = c(0, 1)),
                              CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

#plot simulation results (Figure R3)

pdf("figureR3B.pdf", width = 9, height = 7)

ggplot(MC4_uni_ranvar_all, aes(x = ranvar, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted", "dotdash")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Component Standard Deviations", y = "Estimated Level of Polarization",
       title = TeX("B: Random $\\sigma$ (intragroup homogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

pdf("figureR3A.pdf", width = 9, height = 7)

ggplot(MC4_uni_ranmean_all, aes(x = ranmean, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted", "dotdash")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Distance Between Component Means", y = "Estimated Level of Polarization",
       title = TeX("A: Random $\\mu$ (intergroup heterogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

#calculate polarization ranks for random variance distributions

MC4_uni_ranvar1 <- mutate(MC4_uni_ranvar1, rank_true = rank(-ranvar),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          mean_label = rep("means = (-5, 5)", 1000))
MC4_uni_ranvar2 <- mutate(MC4_uni_ranvar2, rank_true = rank(-ranvar),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          mean_label = rep("means = (-4, 4)", 1000))
MC4_uni_ranvar3 <- mutate(MC4_uni_ranvar3, rank_true = rank(-ranvar),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          mean_label = rep("means = (-3, 3)", 1000))
MC4_uni_ranvar4 <- mutate(MC4_uni_ranvar4, rank_true = rank(-ranvar),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          mean_label = rep("means = (-2, 2)", 1000))

#calculate polarization ranks for random mean distributions

MC4_uni_ranmean1 <- mutate(MC4_uni_ranmean1, rank_true = rank(ranmean),
                           difference = rank(difference),
                           variance = rank(variance),
                           CPC = rank(CPC),
                           variance_label = rep("sd = 0.5", 1000))
MC4_uni_ranmean2 <- mutate(MC4_uni_ranmean2, rank_true = rank(ranmean),
                           difference = rank(difference),
                           variance = rank(variance),
                           CPC = rank(CPC),
                           variance_label = rep("sd = 1", 1000))
MC4_uni_ranmean3 <- mutate(MC4_uni_ranmean3, rank_true = rank(ranmean),
                           difference = rank(difference),
                           variance = rank(variance),
                           CPC = rank(CPC),
                           variance_label = rep("sd = 1.5", 1000))
MC4_uni_ranmean4 <- mutate(MC4_uni_ranmean4, rank_true = rank(ranmean),
                           difference = rank(difference),
                           variance = rank(variance),
                           CPC = rank(CPC),
                           variance_label = rep("sd = 2", 1000))

#combine polarization rank data into single data set

MC4_uni_ranvar_plot <- rbind(MC4_uni_ranvar1, MC4_uni_ranvar2, MC4_uni_ranvar3, MC4_uni_ranvar4)
MC4_uni_ranmean_plot <- rbind(MC4_uni_ranmean1, MC4_uni_ranmean2, MC4_uni_ranmean3, MC4_uni_ranmean4)

#tidy data and create labels for error calculation

MC4_uni_ranvar_plot <- pivot_longer(MC4_uni_ranvar_plot, cols = 2:4,
                                    names_to = "measure", values_to = "value") %>%
  mutate(measure = ifelse(measure == "difference", "Difference",
                          ifelse(measure == "variance", "Variance", measure)))
MC4_uni_ranmean_plot <- pivot_longer(MC4_uni_ranmean_plot, cols = 2:4,
                                     names_to = "measure", values_to = "value") %>%
  mutate(measure = ifelse(measure == "difference", "Difference",
                          ifelse(measure == "variance", "Variance", measure)))

MC4_uni_ranvar_plot$mean_label <- factor(MC4_uni_ranvar_plot$mean_label,
                                         levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                    "means = (-4, 4)", "means = (-5, 5)"))
MC4_uni_ranmean_plot$variance_label <- factor(MC4_uni_ranmean_plot$variance_label,
                                              levels = c("sd = 2", "sd = 1.5", "sd = 1", "sd = 0.5"))

MC4_uni_ranvar_plot$measure <- factor(MC4_uni_ranvar_plot$measure,
                                      levels = c("Difference", "Variance", "CPC"))
MC4_uni_ranmean_plot$measure <- factor(MC4_uni_ranmean_plot$measure,
                                       levels = c("Difference", "Variance", "CPC"))

###################################################################################
#Simulations for bivariate distributions (4 clusters)
###################################################################################

#generate Gaussian mixture distributions

MC4_bi_ranvar <- list()
MC4_bi_ranmean <- list()
MC4_bi_variances <- c()
MC4_bi_means <- c()

for (i in means_values) {
  for (j in 1:1000) {
    variance <- runif(1, min = 0.5, max = 2)
    data <- as.matrix(rbind(mvrnorm(250, c(-i, i),
                                    matrix(c(variance^2, 0, 0, variance^2), ncol = 2, byrow = TRUE)),
                            mvrnorm(250, c(i, i),
                                    matrix(c(variance^2, 0, 0, variance^2), ncol = 2, byrow = TRUE)),
                            mvrnorm(250, c(i, -i),
                                    matrix(c(variance^2, 0, 0, variance^2), ncol = 2, byrow = TRUE)),
                            mvrnorm(250, c(-i, -i),
                                    matrix(c(variance^2, 0, 0, variance^2), ncol = 2, byrow = TRUE))))
    cluster <- matrix(c(rep(1, 250), rep(2, 250), rep(3, 250), rep(4, 250)),
                      ncol = 1)
    data <- cbind(data, cluster)
    MC4_bi_ranvar[[length(MC4_bi_ranvar) + 1]] <- data
    MC4_bi_variances[[length(MC4_bi_variances) + 1]] <- variance
  }
  base::print(i)
}

for (i in variance_values) {
  for (j in 1:1000) {
    mean <- runif(1, min = 2, max = 5)
    data <- as.matrix(rbind(mvrnorm(250, c(-mean, mean),
                                    matrix(c(i^2, 0, 0, i^2), ncol = 2, byrow = TRUE)),
                            mvrnorm(250, c(mean, mean),
                                    matrix(c(i^2, 0, 0, i^2), ncol = 2, byrow = TRUE)),
                            mvrnorm(250, c(mean, -mean),
                                    matrix(c(i^2, 0, 0, i^2), ncol = 2, byrow = TRUE)),
                            mvrnorm(250, c(-mean, -mean),
                                    matrix(c(i^2, 0, 0, i^2), ncol = 2, byrow = TRUE))))
    cluster <- matrix(c(rep(1, 250), rep(2, 250), rep(3, 250), rep(4, 250)), ncol = 1)
    data <- cbind(data, cluster)
    MC4_bi_ranmean[[length(MC4_bi_ranmean)+1]] <- data
    MC4_bi_means[[length(MC4_bi_means)+1]] <- mean
  }
  base::print(i)
}

#calculate various polarization scores for simulated distributions

MC4_bi_ranvar_diff <- pblapply(MC4_bi_ranvar,
                               function(x) diff_multidim(data = x, cols = 1:2, clusters = 3),
                               cl = cores)
MC4_bi_ranvar_var <- pblapply(MC4_bi_ranvar, function(x) tr(cov(x[,1:2], use = "complete.obs")),
                              cl = cores)
MC4_bi_ranvar_CPC <- pblapply(MC4_bi_ranvar,
                              function(x) CPC(x, "manual", cols = 1:2, clusters = 3, adjust = TRUE),
                              cl = cores)

MC4_bi_ranmean_diff <- pblapply(MC4_bi_ranmean,
                                function(x) diff_multidim(data = x, cols = 1:2, clusters = 3),
                                cl = cores)
MC4_bi_ranmean_var <- pblapply(MC4_bi_ranmean, function(x) tr(cov(x[,1:2], use = "complete.obs")),
                               cl = cores)
MC4_bi_ranmean_CPC <- pblapply(MC4_bi_ranmean,
                               function(x) CPC(x, "manual", cols = 1:2, clusters = 3, adjust = TRUE),
                               cl = cores)

#combine random variance polarization results into data frames, holding means constant within data frame for accurate rank calculations

MC4_bi_ranvar1 <- as.data.frame(matrix(data = c(unlist(MC4_bi_variances[1:1000]),
                                                unlist(MC4_bi_ranvar_diff[1:1000]),
                                                unlist(MC4_bi_ranvar_var[1:1000]),
                                                unlist(MC4_bi_ranvar_CPC[1:1000])),
                                       nrow = 1000))
colnames(MC4_bi_ranvar1) <- c("ranvar", "difference", "variance", "CPC")

MC4_bi_ranvar2 <- as.data.frame(matrix(data = c(unlist(MC4_bi_variances[1001:2000]),
                                                unlist(MC4_bi_ranvar_diff[1001:2000]),
                                                unlist(MC4_bi_ranvar_var[1001:2000]),
                                                unlist(MC4_bi_ranvar_CPC[1001:2000])),
                                       nrow = 1000))
colnames(MC4_bi_ranvar2) <- c("ranvar", "difference", "variance", "CPC")

MC4_bi_ranvar3 <- as.data.frame(matrix(data = c(unlist(MC4_bi_variances[2001:3000]),
                                                unlist(MC4_bi_ranvar_diff[2001:3000]),
                                                unlist(MC4_bi_ranvar_var[2001:3000]),
                                                unlist(MC4_bi_ranvar_CPC[2001:3000])),
                                       nrow = 1000))
colnames(MC4_bi_ranvar3) <- c("ranvar", "difference", "variance", "CPC")

MC4_bi_ranvar4 <- as.data.frame(matrix(data = c(unlist(MC4_bi_variances[3001:4000]),
                                                unlist(MC4_bi_ranvar_diff[3001:4000]),
                                                unlist(MC4_bi_ranvar_var[3001:4000]),
                                                unlist(MC4_bi_ranvar_CPC[3001:4000])),
                                       nrow = 1000))
colnames(MC4_bi_ranvar4) <- c("ranvar", "difference", "variance", "CPC")

#combine random mean polarization results into data frames, holding variance constant within data frame for accurate rank calculations

MC4_bi_ranmean1 <- as.data.frame(matrix(data = c(unlist(MC4_bi_means[1:1000]),
                                                 unlist(MC4_bi_ranmean_diff[1:1000]),
                                                 unlist(MC4_bi_ranmean_var[1:1000]),
                                                 unlist(MC4_bi_ranmean_CPC[1:1000])),
                                        nrow = 1000))
colnames(MC4_bi_ranmean1) <- c("ranmean", "difference", "variance", "CPC")

MC4_bi_ranmean2 <- as.data.frame(matrix(data = c(unlist(MC4_bi_means[1001:2000]),
                                                 unlist(MC4_bi_ranmean_diff[1001:2000]),
                                                 unlist(MC4_bi_ranmean_var[1001:2000]),
                                                 unlist(MC4_bi_ranmean_CPC[1001:2000])),
                                        nrow = 1000))
colnames(MC4_bi_ranmean2) <- c("ranmean", "difference", "variance", "CPC")

MC4_bi_ranmean3 <- as.data.frame(matrix(data = c(unlist(MC4_bi_means[2001:3000]),
                                                 unlist(MC4_bi_ranmean_diff[2001:3000]),
                                                 unlist(MC4_bi_ranmean_var[2001:3000]),
                                                 unlist(MC4_bi_ranmean_CPC[2001:3000])),
                                        nrow = 1000))
colnames(MC4_bi_ranmean3) <- c("ranmean", "difference", "variance", "CPC")

MC4_bi_ranmean4 <- as.data.frame(matrix(data = c(unlist(MC4_bi_means[3001:4000]),
                                                 unlist(MC4_bi_ranmean_diff[3001:4000]),
                                                 unlist(MC4_bi_ranmean_var[3001:4000]),
                                                 unlist(MC4_bi_ranmean_CPC[3001:4000])),
                                        nrow = 1000))
colnames(MC4_bi_ranmean4) <- c("ranmean", "difference", "variance", "CPC")

#merge data sets together for plotting

MC4_bi_ranvar_all <- rbind(MC4_bi_ranvar1, MC4_bi_ranvar2, MC4_bi_ranvar3, MC4_bi_ranvar4)
MC4_bi_ranmean_all <- rbind(MC4_bi_ranmean1, MC4_bi_ranmean2, MC4_bi_ranmean3, MC4_bi_ranmean4)

#add means and standard deviation labels and scale measures

ranvar_labels <- c(rep(c("means = (-5, 5)", "means = (-4, 4)", "means = (-3, 3)", "means = (-2, 2)"),
                       each = 1000))
ranmean_labels <- c(rep(c("sd = 0.5", "sd = 1", "sd = 1.5", "sd = 2"), each = 1000))

ranvar_labels <- factor(ranvar_labels, levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                  "means = (-4, 4)", "means = (-5, 5)"))
ranmean_labels <- factor(ranmean_labels, levels = c("sd = 2", "sd = 1.5",
                                                    "sd = 1", "sd = 0.5"))

levels(ranvar_labels) <- c("means = (-2, 2)" = TeX("$\\mu = \\{-2, 2\\}$"),
                           "means = (-3, 3)" = TeX("$\\mu = \\{-3, 3\\}$"),
                           "means = (-4, 4)" = TeX("$\\mu = \\{-4, 4\\}$"),
                           "means = (-5, 5)" = TeX("$\\mu = \\{-5, 5\\}$"))
levels(ranmean_labels) <- c("sd = 2" = TeX("$\\sigma = 2$"),
                            "sd = 1.5" = TeX("$\\sigma = 1.5$"),
                            "sd = 1" = TeX("$\\sigma = 1$"),
                            "sd = 0.5" = TeX("$\\sigma = 0.5$"))

MC4_bi_ranvar_all <- cbind(MC4_bi_ranvar_all, label = ranvar_labels)
MC4_bi_ranmean_all <- cbind(MC4_bi_ranmean_all, label = ranmean_labels)

MC4_bi_ranvar_all <- mutate(MC4_bi_ranvar_all,
                            difference = scales::rescale(difference, to = c(0, 1)),
                            variance = scales::rescale(variance, to = c(0, 1)),
                            CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

MC4_bi_ranmean_all <- mutate(MC4_bi_ranmean_all, ranmean = ranmean*2,
                             difference = scales::rescale(difference, to = c(0, 1)),
                             variance = scales::rescale(variance, to = c(0, 1)),
                             CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

#plot simulation results (Figure R4)

pdf("figureR4B.pdf", width = 9, height = 7)

ggplot(MC4_bi_ranvar_all, aes(x = ranvar, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted", "dotdash")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Component Standard Deviations", y = "Estimated Level of Polarization",
       title = TeX("B: Random $\\sigma$ (intragroup homogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

pdf("figureR4A.pdf", width = 9, height = 7)

ggplot(MC4_bi_ranmean_all, aes(x = ranmean, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotted", "dotdash")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Distance Between Component Means", y = "Estimated Level of Polarization",
       title = TeX("A: Random $\\mu$ (intergroup heterogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

#calculate polarization ranks for random variance distributions

MC4_bi_ranvar1 <- mutate(MC4_bi_ranvar1, rank_true = rank(-ranvar),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         mean_label = rep("means = (-5, 5)", 1000))
MC4_bi_ranvar2 <- mutate(MC4_bi_ranvar2, rank_true = rank(-ranvar),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         mean_label = rep("means = (-4, 4)", 1000))
MC4_bi_ranvar3 <- mutate(MC4_bi_ranvar3, rank_true = rank(-ranvar),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         mean_label = rep("means = (-3, 3)", 1000))
MC4_bi_ranvar4 <- mutate(MC4_bi_ranvar4, rank_true = rank(-ranvar),
                         difference = rank(difference),
                         variance = rank(variance),
                         CPC = rank(CPC),
                         mean_label = rep("means = (-2, 2)", 1000))

#calculate polarization ranks for random mean distributions

MC4_bi_ranmean1 <- mutate(MC4_bi_ranmean1, rank_true = rank(ranmean),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          variance_label = rep("sd = 0.5", 1000))
MC4_bi_ranmean2 <- mutate(MC4_bi_ranmean2, rank_true = rank(ranmean),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          variance_label = rep("sd = 1", 1000))
MC4_bi_ranmean3 <- mutate(MC4_bi_ranmean3, rank_true = rank(ranmean),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          variance_label = rep("sd = 1.5", 1000))
MC4_bi_ranmean4 <- mutate(MC4_bi_ranmean4, rank_true = rank(ranmean),
                          difference = rank(difference),
                          variance = rank(variance),
                          CPC = rank(CPC),
                          variance_label = rep("sd = 2", 1000))

#combine polarization rank data into single data set for plotting

MC4_bi_ranvar_plot <- rbind(MC4_bi_ranvar1, MC4_bi_ranvar2, MC4_bi_ranvar3, MC4_bi_ranvar4)
MC4_bi_ranmean_plot <- rbind(MC4_bi_ranmean1, MC4_bi_ranmean2, MC4_bi_ranmean3, MC4_bi_ranmean4)

#tidy data and create labels for plotting

MC4_bi_ranvar_plot <- pivot_longer(MC4_bi_ranvar_plot, cols = 2:4,
                                   names_to = "measure", values_to = "value") %>%
  mutate(measure = ifelse(measure == "difference", "Difference",
                          ifelse(measure == "variance", "Variance", measure)))
MC4_bi_ranmean_plot <- pivot_longer(MC4_bi_ranmean_plot, cols = 2:4,
                                    names_to = "measure", values_to = "value") %>%
  mutate(measure = ifelse(measure == "difference", "Difference",
                          ifelse(measure == "variance", "Variance", measure)))

MC4_bi_ranvar_plot$mean_label <- factor(MC4_bi_ranvar_plot$mean_label,
                                        levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                   "means = (-4, 4)", "means = (-5, 5)"))
MC4_bi_ranmean_plot$variance_label <- factor(MC4_bi_ranmean_plot$variance_label,
                                             levels = c("sd = 2", "sd = 1.5", "sd = 1", "sd = 0.5"))
MC4_bi_ranvar_plot$measure <- factor(MC4_bi_ranvar_plot$measure,
                                     levels = c("Difference", "Variance", "CPC"))
MC4_bi_ranmean_plot$measure <- factor(MC4_bi_ranmean_plot$measure,
                                      levels = c("Difference", "Variance", "CPC"))

###################################################################################
#RMSE and Spearman's r
###################################################################################

#apply dynamic and dimension labels and combine distance and concentration data frames

MC_uni_ranmean_plot$dynamic <- "distance"
MC_uni_ranvar_plot$dynamic <- "concentration"
MC_bi_ranmean_plot$dynamic <- "distance"
MC_bi_ranvar_plot$dynamic <- "concentration"

MC3_uni_ranmean_plot$dynamic <- "distance"
MC3_uni_ranvar_plot$dynamic <- "concentration"
MC3_bi_ranmean_plot$dynamic <- "distance"
MC3_bi_ranvar_plot$dynamic <- "concentration"

MC4_uni_ranmean_plot$dynamic <- "distance"
MC4_uni_ranvar_plot$dynamic <- "concentration"
MC4_bi_ranmean_plot$dynamic <- "distance"
MC4_bi_ranvar_plot$dynamic <- "concentration"

MC_uni_both <- rbind(MC_uni_ranmean_plot[,c(2, 4:6)], MC_uni_ranvar_plot[,c(2, 4:6)])
MC_bi_both <- rbind(MC_bi_ranmean_plot[,c(2, 4:6)], MC_bi_ranvar_plot[,c(2, 4:6)])

MC3_uni_both <- rbind(MC3_uni_ranmean_plot[,c(2, 4:6)], MC3_uni_ranvar_plot[,c(2, 4:6)])
MC3_bi_both <- rbind(MC3_bi_ranmean_plot[,c(2, 4:6)], MC3_bi_ranvar_plot[,c(2, 4:6)])

MC4_uni_both <- rbind(MC4_uni_ranmean_plot[,c(2, 4:6)], MC4_uni_ranvar_plot[,c(2, 4:6)])
MC4_bi_both <- rbind(MC4_bi_ranmean_plot[,c(2, 4:6)], MC4_bi_ranvar_plot[,c(2, 4:6)])

#create separate data frames of all observations and apply dynamic labels

MC_uni_all <- rbind(MC_uni_ranmean_plot[,c(2, 4, 5)], MC_uni_ranvar_plot[,c(2, 4, 5)]) %>%
  mutate(dynamic = "all")
MC_bi_all <- rbind(MC_bi_ranmean_plot[,c(2, 4, 5)], MC_bi_ranvar_plot[,c(2, 4, 5)]) %>%
  mutate(dynamic = "all")

MC3_uni_all <- rbind(MC3_uni_ranmean_plot[,c(2, 4, 5)], MC3_uni_ranvar_plot[,c(2, 4, 5)]) %>%
  mutate(dynamic = "all")
MC3_bi_all <- rbind(MC3_bi_ranmean_plot[,c(2, 4, 5)], MC3_bi_ranvar_plot[,c(2, 4, 5)]) %>%
  mutate(dynamic = "all")

MC4_uni_all <- rbind(MC4_uni_ranmean_plot[,c(2, 4, 5)], MC4_uni_ranvar_plot[,c(2, 4, 5)]) %>%
  mutate(dynamic = "all")
MC4_bi_all <- rbind(MC4_bi_ranmean_plot[,c(2, 4, 5)], MC4_bi_ranvar_plot[,c(2, 4, 5)]) %>%
  mutate(dynamic = "all")

#create unified data sets for both univariate and bivariate distributions and apply dimension labels

MC_uni <- rbind(MC_uni_both, MC_uni_all) %>%
  mutate(dimensions = "univariate")
MC_bi <- rbind(MC_bi_both, MC_bi_all) %>%
  mutate(dimensions = "bivariate")

MC3_uni <- rbind(MC3_uni_both, MC3_uni_all) %>%
  mutate(dimensions = "univariate")
MC3_bi <- rbind(MC3_bi_both, MC3_bi_all) %>%
  mutate(dimensions = "bivariate")

MC4_uni <- rbind(MC4_uni_both, MC4_uni_all) %>%
  mutate(dimensions = "univariate")
MC4_bi <- rbind(MC4_bi_both, MC4_bi_all) %>%
  mutate(dimensions = "bivariate")

#create unified data set of predictions

MC_all <- rbind(MC_uni, MC_bi)
MC3_all <- rbind(MC3_uni, MC3_bi)
MC4_all <- rbind(MC4_uni, MC4_bi)

#calculate RMSE and MAE for univariate and bivariate distance, concentration, and total

MC_metrics <- group_by(MC_all, dimensions, dynamic, measure) %>%
  dplyr::summarize(RMSE = round(Metrics::rmse(rank_true, value), 3),
                   Spearman = round(cor(rank_true, value, method = "spearman"), 3))

MC3_metrics <- group_by(MC3_all, dimensions, dynamic, measure) %>%
  dplyr::summarize(RMSE = round(Metrics::rmse(rank_true, value), 3),
                   Spearman = round(cor(rank_true, value, method = "spearman"), 3))

MC4_metrics <- group_by(MC4_all, dimensions, dynamic, measure) %>%
  dplyr::summarize(RMSE = round(Metrics::rmse(rank_true, value), 3),
                   Spearman = round(cor(rank_true, value, method = "spearman"), 3))

#print metrics (Table S3)

print(as.data.frame(MC_metrics))
print(as.data.frame(MC3_metrics))
print(as.data.frame(MC4_metrics))

###################################################################################
#Simulations with unequal component weights
###################################################################################

#set seed

set.seed(123)

#generate Gaussian mixture distributions with middling level of polarization, varying mixture weights

MC_uni_ranvar <- list()
MC_uni_ranmean <- list()
MC_uni_ranvar_weights <- c()
MC_uni_ranmean_weights <- c()

for (i in means_values) {
  for (j in 1:1000) {
    weight <- round(runif(1, min = 0, max = 1), 3)
    weight2 <- 1 - weight
    weight_diff <- abs(weight - weight2)
    data <- matrix(c(rnorm(1000*weight, -i, 1), rnorm(1000*weight2, i, 1)), ncol = 1)
    cluster <- matrix(c(rep(1, 1000*weight), rep(2, 1000*weight2)), ncol = 1)
    data <- cbind(data, cluster)
    MC_uni_ranvar[[length(MC_uni_ranvar) + 1]] <- data
    MC_uni_ranvar_weights[[length(MC_uni_ranvar_weights) + 1]] <- weight_diff
  }
  base::print(i)
}

for (i in variance_values) {
  for (j in 1:1000) {
    weight <- round(runif(1, min = 0, max = 1), 3)
    weight2 <- 1 - weight
    weight_diff <- abs(weight - weight2)
    data <- matrix(c(rnorm(1000*weight, -3, i), rnorm(1000*weight2, 3, i)), ncol = 1)
    cluster <- matrix(c(rep(1, 1000*weight), rep(2, 1000*weight2)), ncol = 1)
    data <- cbind(data, cluster)
    MC_uni_ranmean[[length(MC_uni_ranmean) + 1]] <- data
    MC_uni_ranmean_weights[[length(MC_uni_ranmean_weights) + 1]] <- weight_diff
  }
  base::print(i)
}

#calculate various polarization scores for simulated distributions

MC_uni_ranvar_diff <- pblapply(MC_uni_ranvar,
                               function(x) diff_multidim(x, cols = 1, clusters = 2), cl = cores)
MC_uni_ranvar_var <- pblapply(MC_uni_ranvar, function(x) var(x[,1]), cl = cores)
MC_uni_ranvar_CPC <- pblapply(MC_uni_ranvar,
                              function(x) CPC(x, "manual", cols = 1, clusters = 2, adjust = TRUE),
                              cl = cores)

MC_uni_ranmean_diff <- pblapply(MC_uni_ranmean,
                                function(x) diff_multidim(x, cols = 1, clusters = 2), cl = cores)
MC_uni_ranmean_var <- pblapply(MC_uni_ranmean, function(x) var(x[,1]), cl = cores)
MC_uni_ranmean_CPC <- pblapply(MC_uni_ranmean,
                               function(x) CPC(x, "manual", cols = 1, clusters = 2, adjust = TRUE),
                               cl = cores)

#combine random variance polarization results into data frames, holding means constant within data frame for accurate rank calculations

MC_uni_ranvar1 <- as.data.frame(matrix(data = c(unlist(MC_uni_ranvar_weights[1:1000]),
                                                unlist(MC_uni_ranvar_diff[1:1000]),
                                                unlist(MC_uni_ranvar_var[1:1000]),
                                                unlist(MC_uni_ranvar_CPC[1:1000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar1) <- c("weight_diff", "difference", "variance", "CPC")

MC_uni_ranvar2 <- as.data.frame(matrix(data = c(unlist(MC_uni_ranvar_weights[1001:2000]),
                                                unlist(MC_uni_ranvar_diff[1001:2000]),
                                                unlist(MC_uni_ranvar_var[1001:2000]),
                                                unlist(MC_uni_ranvar_CPC[1001:2000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar2) <- c("weight_diff", "difference", "variance", "CPC")

MC_uni_ranvar3 <- as.data.frame(matrix(data = c(unlist(MC_uni_ranvar_weights[2001:3000]),
                                                unlist(MC_uni_ranvar_diff[2001:3000]),
                                                unlist(MC_uni_ranvar_var[2001:3000]),
                                                unlist(MC_uni_ranvar_CPC[2001:3000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar3) <- c("weight_diff", "difference", "variance", "CPC")

MC_uni_ranvar4 <- as.data.frame(matrix(data = c(unlist(MC_uni_ranvar_weights[3001:4000]),
                                                unlist(MC_uni_ranvar_diff[3001:4000]),
                                                unlist(MC_uni_ranvar_var[3001:4000]),
                                                unlist(MC_uni_ranvar_CPC[3001:4000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar4) <- c("weight_diff", "difference" ,"variance", "CPC")

#combine random mean polarization results into data frames, holding variance constant within data frame for accurate rank calculations

MC_uni_ranmean1 <- as.data.frame(matrix(data = c(unlist(MC_uni_ranmean_weights[1:1000]),
                                                 unlist(MC_uni_ranmean_diff[1:1000]),
                                                 unlist(MC_uni_ranmean_var[1:1000]),
                                                 unlist(MC_uni_ranmean_CPC[1:1000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean1) <- c("weight_diff", "difference", "variance", "CPC")

MC_uni_ranmean2 <- as.data.frame(matrix(data = c(unlist(MC_uni_ranmean_weights[1001:2000]),
                                                 unlist(MC_uni_ranmean_diff[1001:2000]),
                                                 unlist(MC_uni_ranmean_var[1001:2000]),
                                                 unlist(MC_uni_ranmean_CPC[1001:2000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean2) <- c("weight_diff", "difference", "variance", "CPC")

MC_uni_ranmean3 <- as.data.frame(matrix(data = c(unlist(MC_uni_ranmean_weights[2001:3000]),
                                                 unlist(MC_uni_ranmean_diff[2001:3000]),
                                                 unlist(MC_uni_ranmean_var[2001:3000]),
                                                 unlist(MC_uni_ranmean_CPC[2001:3000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean3) <- c("weight_diff", "difference", "variance", "CPC")

MC_uni_ranmean4 <- as.data.frame(matrix(data = c(unlist(MC_uni_ranmean_weights[3001:4000]),
                                                 unlist(MC_uni_ranmean_diff[3001:4000]),
                                                 unlist(MC_uni_ranmean_var[3001:4000]),
                                                 unlist(MC_uni_ranmean_CPC[3001:4000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean4) <- c("weight_diff", "difference", "variance", "CPC")

#merge data sets together for plotting

MC_uni_ranvar_all <- rbind(MC_uni_ranvar1, MC_uni_ranvar2, MC_uni_ranvar3, MC_uni_ranvar4)
MC_uni_ranmean_all <- rbind(MC_uni_ranmean1, MC_uni_ranmean2, MC_uni_ranmean3, MC_uni_ranmean4)

#add means and standard deviation labels and scale measures

ranvar_labels <- c(rep(c("means = (-5, 5)", "means = (-4, 4)", "means = (-3, 3)", "means = (-2, 2)"),
                       each = 1000))
ranmean_labels <- c(rep(c("sd = 0.5", "sd = 1", "sd = 1.5", "sd = 2"), each = 1000))

ranvar_labels <- factor(ranvar_labels, levels = c("means = (-2, 2)", "means = (-3, 3)",
                                                  "means = (-4, 4)", "means = (-5, 5)"))
ranmean_labels <- factor(ranmean_labels, levels = c("sd = 2", "sd = 1.5",
                                                    "sd = 1", "sd = 0.5"))

levels(ranvar_labels) <- c("means = (-2, 2)" = TeX("$\\mu = \\{-2, 2\\}$"),
                           "means = (-3, 3)" = TeX("$\\mu = \\{-3, 3\\}$"),
                           "means = (-4, 4)" = TeX("$\\mu = \\{-4, 4\\}$"),
                           "means = (-5, 5)" = TeX("$\\mu = \\{-5, 5\\}$"))
levels(ranmean_labels) <- c("sd = 2" = TeX("$\\sigma = 2$"),
                            "sd = 1.5" = TeX("$\\sigma = 1.5$"),
                            "sd = 1" = TeX("$\\sigma = 1$"),
                            "sd = 0.5" = TeX("$\\sigma = 0.5$"))

MC_uni_ranvar_all <- cbind(MC_uni_ranvar_all, label = ranvar_labels)
MC_uni_ranmean_all <- cbind(MC_uni_ranmean_all, label = ranmean_labels)

MC_uni_ranvar_all <- mutate(MC_uni_ranvar_all,
                            difference = scales::rescale(difference, to = c(0, 1)),
                            variance = scales::rescale(variance, to = c(0, 1)),
                            CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

MC_uni_ranmean_all <- mutate(MC_uni_ranmean_all,
                             difference = scales::rescale(difference, to = c(0, 1)),
                             variance = scales::rescale(variance, to = c(0, 1)),
                             CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

MC_uni_ranvar_all$Measure <- factor(MC_uni_ranvar_all$Measure,
                                    levels = c("CPC", "Difference", "Variance"))
MC_uni_ranmean_all$Measure <- factor(MC_uni_ranmean_all$Measure,
                                     levels = c("CPC", "Difference", "Variance"))

#plot simulation results (Figure S6)

pdf("figureS6B.pdf", width = 9, height = 7)

ggplot(MC_uni_ranvar_all, aes(x = weight_diff, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotdash", "dotted")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Difference in Component Weights", y = "Estimated Level of Polarization",
       title = TeX("B: Random $\\sigma$ (intragroup homogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

pdf("figureS6A.pdf", width = 9, height = 7)

ggplot(MC_uni_ranmean_all, aes(x = weight_diff, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotdash", "dotted")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Difference in Component Weights", y = "Estimated Level of Polarization",
       title = TeX("A: Random $\\mu$ (intergroup heterogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

###################################################################################
#Simulations for log-normal distributions (unscaled)
###################################################################################

#set seed

set.seed(123)

#generate Gaussian mixture distributions

variance_log <- c(0.25, 0.5, 0.75)
means_log <- c(3, 2, 1)

MC_uni_ranvar <- list()
MC_uni_ranmean <- list()
MC_uni_variances <- c()
MC_uni_means <- c()

for (i in means_log) {
  for (j in 1:1000) {
    variance <- runif(1, min = 0.25, max = 0.75)
    data <- rlnorm(500, i, variance)
    data[501:1000] <- -data[1:500]
    cluster <- matrix(c(rep(1, 500), rep(2, 500)), ncol = 1)
    data <- cbind(data, cluster)
    MC_uni_ranvar[[length(MC_uni_ranvar) + 1]] <- data
    MC_uni_variances[[length(MC_uni_variances) + 1]] <- variance
  }
  base::print(i)
}

for (i in variance_log) {
  for (j in 1:1000) {
    mean <- runif(1, min = 1, max = 3)
    data <- rlnorm(500, mean, i)
    data[501:1000] <- -data[1:500]
    cluster <- matrix(c(rep(1, 500), rep(2, 500)), ncol = 1)
    data <- cbind(data, cluster)
    MC_uni_ranmean[[length(MC_uni_ranmean) + 1]] <- data
    MC_uni_means[[length(MC_uni_means) + 1]] <- mean
  }
  base::print(i)
}

#calculate various polarization scores for simulated distributions

MC_uni_ranvar_diff <- pblapply(MC_uni_ranvar,
                               function(x) diff_multidim(x, cols = 1, clusters = 2), cl = cores)
MC_uni_ranvar_var <- pblapply(MC_uni_ranvar, function(x) var(x[,1]), cl = cores)
MC_uni_ranvar_CPC <- pblapply(MC_uni_ranvar,
                              function(x) CPC(x, "manual", cols = 1, clusters = 2, adjust = TRUE),
                              cl = cores)

MC_uni_ranmean_diff <- pblapply(MC_uni_ranmean,
                                function(x) diff_multidim(x, cols = 1, clusters = 2), cl = cores)
MC_uni_ranmean_var <- pblapply(MC_uni_ranmean, function(x) var(x[,1]), cl = cores)
MC_uni_ranmean_CPC <- pblapply(MC_uni_ranmean,
                               function(x) CPC(x, "manual", cols = 1, clusters = 2, adjust = TRUE),
                               cl = cores)

#combine random variance polarization results into data frames, holding means constant within data frame for accurate rank calculations

MC_uni_ranvar1 <- as.data.frame(matrix(data = c(unlist(MC_uni_variances[1:1000]),
                                                unlist(MC_uni_ranvar_diff[1:1000]),
                                                unlist(MC_uni_ranvar_var[1:1000]),
                                                unlist(MC_uni_ranvar_CPC[1:1000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar1) <- c("ranvar", "difference", "variance", "CPC")

MC_uni_ranvar2 <- as.data.frame(matrix(data = c(unlist(MC_uni_variances[1001:2000]),
                                                unlist(MC_uni_ranvar_diff[1001:2000]),
                                                unlist(MC_uni_ranvar_var[1001:2000]),
                                                unlist(MC_uni_ranvar_CPC[1001:2000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar2) <- c("ranvar", "difference", "variance", "CPC")

MC_uni_ranvar3 <- as.data.frame(matrix(data = c(unlist(MC_uni_variances[2001:3000]),
                                                unlist(MC_uni_ranvar_diff[2001:3000]),
                                                unlist(MC_uni_ranvar_var[2001:3000]),
                                                unlist(MC_uni_ranvar_CPC[2001:3000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar3) <- c("ranvar", "difference", "variance", "CPC")

#combine random mean polarization results into data frames, holding variance constant within data frame for accurate rank calculations

MC_uni_ranmean1 <- as.data.frame(matrix(data = c(unlist(MC_uni_means[1:1000]),
                                                 unlist(MC_uni_ranmean_diff[1:1000]),
                                                 unlist(MC_uni_ranmean_var[1:1000]),
                                                 unlist(MC_uni_ranmean_CPC[1:1000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean1) <- c("ranmean", "difference", "variance", "CPC")

MC_uni_ranmean2 <- as.data.frame(matrix(data = c(unlist(MC_uni_means[1001:2000]),
                                                 unlist(MC_uni_ranmean_diff[1001:2000]),
                                                 unlist(MC_uni_ranmean_var[1001:2000]),
                                                 unlist(MC_uni_ranmean_CPC[1001:2000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean2) <- c("ranmean", "difference", "variance", "CPC")

MC_uni_ranmean3 <- as.data.frame(matrix(data = c(unlist(MC_uni_means[2001:3000]),
                                                 unlist(MC_uni_ranmean_diff[2001:3000]),
                                                 unlist(MC_uni_ranmean_var[2001:3000]),
                                                 unlist(MC_uni_ranmean_CPC[2001:3000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean3) <- c("ranmean", "difference", "variance", "CPC")

#merge data sets together for plotting

MC_uni_ranvar_all <- rbind(MC_uni_ranvar1, MC_uni_ranvar2, MC_uni_ranvar3)
MC_uni_ranmean_all <- rbind(MC_uni_ranmean1, MC_uni_ranmean2, MC_uni_ranmean3)

#add means and standard deviation labels and scale measures

ranvar_labels <- c(rep(c("means = (-e^3, e^3)", "means = (-e^2, e^2)", "means = (-e, e)"),
                       each = 1000))
ranmean_labels <- c(rep(c("sd = e^0.25", "sd = e^0.5", "sd = e^0.75"), each = 1000))

ranvar_labels <- factor(ranvar_labels, levels = c("means = (-e, e)", "means = (-e^2, e^2)",
                                                  "means = (-e^3, e^3)"))
ranmean_labels <- factor(ranmean_labels, levels = c("sd = e^0.25", "sd = e^0.5", "sd = e^0.75"))

levels(ranvar_labels) <- c("means = (-e, e)" = TeX("$\\mu = \\{-e, e\\}$"),
                           "means = (-e^2, e^2)" = TeX("$\\mu = \\{-e^2, e^2\\}$"),
                           "means = (-e^3, e^3)" = TeX("$\\mu = \\{-e^3, e^3\\}$"))
levels(ranmean_labels) <- c("sd = e^0.25" = TeX("$\\sigma = e^\\frac{1}{4}$"),
                            "sd = e^0.5" = TeX("$\\sigma = e^\\frac{1}{2}$"),
                            "sd = e^0.75" = TeX("$\\sigma = e^\\frac{3}{4}$"))

MC_uni_ranvar_all <- cbind(MC_uni_ranvar_all, label = ranvar_labels)
MC_uni_ranmean_all <- cbind(MC_uni_ranmean_all, label = ranmean_labels)

MC_uni_ranvar_all <- mutate(MC_uni_ranvar_all,
                            difference = scales::rescale(difference, to = c(0, 1)),
                            variance = scales::rescale(variance, to = c(0, 1)),
                            CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

MC_uni_ranmean_all <- mutate(MC_uni_ranmean_all, ranmean = ranmean*2,
                             difference = scales::rescale(difference, to = c(0, 1)),
                             variance = scales::rescale(variance, to = c(0, 1)),
                             CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

#change base size

theme_set(theme_bw(base_size = 18))

#plot simulation results (Figure S4)

pdf("figureS4B.pdf", width = 9, height = 7)

ggplot(MC_uni_ranvar_all, aes(x = ranvar, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotdash", "dotted")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Component Standard Deviations", y = "Estimated Level of Polarization",
       title = TeX("B: Random $\\sigma$ (intragroup homogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -0.9))

dev.off()

pdf("figureS4A.pdf", width = 9, height = 7)

ggplot(MC_uni_ranmean_all, aes(x = ranmean, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotdash", "dotted")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Distance Between Component Means", y = "Estimated Level of Polarization",
       title = TeX("A: Random $\\mu$ (intergroup heterogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()

###################################################################################
#Simulations for log-normal distributions (scaled)
###################################################################################

#set seed

set.seed(123)

#generate Gaussian mixture distributions

MC_uni_ranvar <- list()
MC_uni_ranmean <- list()
MC_uni_variances <- c()
MC_uni_means <- c()

for (i in means_log) {
  for (j in 1:1000) {
    variance <- runif(1, min = 0.25, max = 0.75)
    data <- rlnorm(500, i, variance)
    data <- log(data)
    data[501:1000] <- -data[1:500]
    cluster <- matrix(c(rep(1, 500), rep(2, 500)), ncol = 1)
    data <- cbind(data, cluster)
    MC_uni_ranvar[[length(MC_uni_ranvar) + 1]] <- data
    MC_uni_variances[[length(MC_uni_variances) + 1]] <- variance
  }
  base::print(i)
}

for (i in variance_log) {
  for (j in 1:1000) {
    mean <- runif(1, min = 1, max = 3)
    data <- rlnorm(500, mean, i)
    data <- log(data)
    data[501:1000] <- -data[1:500]
    cluster <- matrix(c(rep(1, 500), rep(2, 500)), ncol = 1)
    data <- cbind(data, cluster)
    MC_uni_ranmean[[length(MC_uni_ranmean) + 1]] <- data
    MC_uni_means[[length(MC_uni_means) + 1]] <- mean
  }
  base::print(i)
}

#calculate various polarization scores for simulated distributions

MC_uni_ranvar_diff <- pblapply(MC_uni_ranvar,
                               function(x) diff_multidim(x, cols = 1, clusters = 2), cl = cores)
MC_uni_ranvar_var <- pblapply(MC_uni_ranvar, function(x) var(x[,1]), cl = cores)
MC_uni_ranvar_CPC <- pblapply(MC_uni_ranvar,
                              function(x) CPC(x, "manual", cols = 1, clusters = 2, adjust = TRUE),
                              cl = cores)

MC_uni_ranmean_diff <- pblapply(MC_uni_ranmean,
                                function(x) diff_multidim(x, cols = 1, clusters = 2), cl = cores)
MC_uni_ranmean_var <- pblapply(MC_uni_ranmean, function(x) var(x[,1]), cl = cores)
MC_uni_ranmean_CPC <- pblapply(MC_uni_ranmean,
                               function(x) CPC(x, "manual", cols = 1, clusters = 2, adjust = TRUE),
                               cl = cores)

#combine random variance polarization results into data frames, holding means constant within data frame for accurate rank calculations

MC_uni_ranvar1 <- as.data.frame(matrix(data = c(unlist(MC_uni_variances[1:1000]),
                                                unlist(MC_uni_ranvar_diff[1:1000]),
                                                unlist(MC_uni_ranvar_var[1:1000]),
                                                unlist(MC_uni_ranvar_CPC[1:1000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar1) <- c("ranvar", "difference", "variance", "CPC")

MC_uni_ranvar2 <- as.data.frame(matrix(data = c(unlist(MC_uni_variances[1001:2000]),
                                                unlist(MC_uni_ranvar_diff[1001:2000]),
                                                unlist(MC_uni_ranvar_var[1001:2000]),
                                                unlist(MC_uni_ranvar_CPC[1001:2000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar2) <- c("ranvar", "difference", "variance", "CPC")

MC_uni_ranvar3 <- as.data.frame(matrix(data = c(unlist(MC_uni_variances[2001:3000]),
                                                unlist(MC_uni_ranvar_diff[2001:3000]),
                                                unlist(MC_uni_ranvar_var[2001:3000]),
                                                unlist(MC_uni_ranvar_CPC[2001:3000])),
                                       nrow = 1000))
colnames(MC_uni_ranvar3) <- c("ranvar", "difference", "variance", "CPC")

#combine random mean polarization results into data frames, holding variance constant within data frame for accurate rank calculations

MC_uni_ranmean1 <- as.data.frame(matrix(data = c(unlist(MC_uni_means[1:1000]),
                                                 unlist(MC_uni_ranmean_diff[1:1000]),
                                                 unlist(MC_uni_ranmean_var[1:1000]),
                                                 unlist(MC_uni_ranmean_CPC[1:1000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean1) <- c("ranmean", "difference", "variance", "CPC")

MC_uni_ranmean2 <- as.data.frame(matrix(data = c(unlist(MC_uni_means[1001:2000]),
                                                 unlist(MC_uni_ranmean_diff[1001:2000]),
                                                 unlist(MC_uni_ranmean_var[1001:2000]),
                                                 unlist(MC_uni_ranmean_CPC[1001:2000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean2) <- c("ranmean", "difference", "variance", "CPC")

MC_uni_ranmean3 <- as.data.frame(matrix(data = c(unlist(MC_uni_means[2001:3000]),
                                                 unlist(MC_uni_ranmean_diff[2001:3000]),
                                                 unlist(MC_uni_ranmean_var[2001:3000]),
                                                 unlist(MC_uni_ranmean_CPC[2001:3000])),
                                        nrow = 1000))
colnames(MC_uni_ranmean3) <- c("ranmean", "difference", "variance", "CPC")

#merge data sets together for plotting

MC_uni_ranvar_all <- rbind(MC_uni_ranvar1, MC_uni_ranvar2, MC_uni_ranvar3)
MC_uni_ranmean_all <- rbind(MC_uni_ranmean1, MC_uni_ranmean2, MC_uni_ranmean3)

#add means and standard deviation labels and scale measures

ranvar_labels <- c(rep(c("means = (-e^3, e^3)", "means = (-e^2, e^2)", "means = (-e, e)"),
                       each = 1000))
ranmean_labels <- c(rep(c("sd = e^0.25", "sd = e^0.5", "sd = e^0.75"), each = 1000))

ranvar_labels <- factor(ranvar_labels, levels = c("means = (-e, e)", "means = (-e^2, e^2)",
                                                  "means = (-e^3, e^3)"))
ranmean_labels <- factor(ranmean_labels, levels = c("sd = e^0.25", "sd = e^0.5", "sd = e^0.75"))

levels(ranvar_labels) <- c("means = (-e, e)" = TeX("$\\mu = \\{-e, e\\}$"),
                           "means = (-e^2, e^2)" = TeX("$\\mu = \\{-e^2, e^2\\}$"),
                           "means = (-e^3, e^3)" = TeX("$\\mu = \\{-e^3, e^3\\}$"))
levels(ranmean_labels) <- c("sd = e^0.25" = TeX("$\\sigma = e^\\frac{1}{4}$"),
                            "sd = e^0.5" = TeX("$\\sigma = e^\\frac{1}{2}$"),
                            "sd = e^0.75" = TeX("$\\sigma = e^\\frac{3}{4}$"))

MC_uni_ranvar_all <- cbind(MC_uni_ranvar_all, label = ranvar_labels)
MC_uni_ranmean_all <- cbind(MC_uni_ranmean_all, label = ranmean_labels)

MC_uni_ranvar_all <- mutate(MC_uni_ranvar_all,
                            difference = scales::rescale(difference, to = c(0, 1)),
                            variance = scales::rescale(variance, to = c(0, 1)),
                            CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

MC_uni_ranmean_all <- mutate(MC_uni_ranmean_all, ranmean = ranmean*2,
                             difference = scales::rescale(difference, to = c(0, 1)),
                             variance = scales::rescale(variance, to = c(0, 1)),
                             CPC = scales::rescale(CPC, to = c(0, 1))) %>%
  pivot_longer(cols = 2:4, names_to = "Measure", values_to = "value") %>%
  mutate(Measure = ifelse(Measure == "difference", "Difference",
                          ifelse(Measure == "variance", "Variance", Measure)))

#plot simulation results (Figure S5)

pdf("figureS5B.pdf", width = 9, height = 7)

ggplot(MC_uni_ranvar_all, aes(x = ranvar, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotdash", "dotted")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Component Standard Deviations", y = "Estimated Level of Polarization",
       title = TeX("B: Random $\\sigma$ (intragroup homogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -0.9))

dev.off()

pdf("figureS5A.pdf", width = 9, height = 7)

ggplot(MC_uni_ranmean_all, aes(x = ranmean, y = value, linetype = Measure, color = Measure)) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_linetype_manual(values = c("solid", "longdash", "dotdash", "dotted")) +
  scale_color_manual(values = c("black", "gray50", "gray50", "gray50")) +
  facet_wrap(~ label, labeller = label_parsed) +
  labs(x = "Distance Between Component Means", y = "Estimated Level of Polarization",
       title = TeX("A: Random $\\mu$ (intergroup heterogeneity)")) +
  theme(strip.background = element_rect(color = "white", fill = "white"),
        axis.text = element_text(size = 16),
        plot.title = element_text(size = 22, hjust = -1))

dev.off()
